Beispiel #1
0
def prepare_training_data():
    """Write sentence/keyword training pairs to ``./data/training.txt``.

    Every poem returned by ``get_corpus()`` gets a fresh ``poem['keyword']``
    list.  A poem is used only when it has exactly four sentences and every
    sentence yields at least one segment that is present in the mapping
    returned by ``get_word_ranks()``.  For each sentence of a usable poem the
    segment with the largest ``word_ranks`` value is chosen as its keyword
    (on ties the later segment wins), appended to ``poem['keyword']`` and
    written to the output file as ``sentence<TAB>keyword``.

    Keywords of two or more characters are also echoed to stdout.
    """
    word_ranks = get_word_ranks()
    corpus = get_corpus()
    segmenter = Segmenter()
    with codecs.open('./data/training.txt', 'w', 'utf-8') as fout:
        for poem in corpus:
            poem['keyword'] = []
            sentences = poem['sentence']

            # Cheap structural check first: no point segmenting a poem that
            # will be rejected for its length anyway.
            if len(sentences) != 4:
                continue

            # Segment each sentence exactly once and remember the result, so
            # the validation pass and the keyword pass share the same work.
            ranked = []
            for sentence in sentences:
                segs = [seg for seg in segmenter.segment(sentence)
                        if seg in word_ranks]
                if not segs:
                    break
                ranked.append((sentence, segs))
            if len(ranked) != len(sentences):
                # At least one sentence had no ranked segment: skip the poem.
                continue

            for sentence, segs in ranked:
                # max() returns the first maximal item it sees; scanning in
                # reverse therefore keeps the *last* of several equally
                # ranked segments, matching a left fold that only keeps the
                # accumulator when it is strictly greater.
                keyword = max(reversed(segs), key=lambda seg: word_ranks[seg])
                poem['keyword'].append(keyword)
                if len(keyword) >= 2:
                    print(sentence, keyword)
                fout.write(sentence + '\t' + keyword + '\n')
Beispiel #2
0
def get_text_ranks():
    """Build a normalised word co-occurrence graph and pass it to ``_text_rank``.

    Each sentence of every poem from ``get_corpus()`` is segmented, segments
    found in ``get_stopwords()`` are dropped, and every pair of distinct
    remaining segments in the same sentence adds 1 to the edge weight in both
    directions.  Afterwards each word's outgoing weights are divided by their
    sum.  The resulting ``{word: {neighbour: weight}}`` mapping is handed to
    ``_text_rank``; this function itself returns ``None``.
    """
    segmenter = Segmenter()
    stopwords = get_stopwords()
    print("Start TextRank over the selected quatrains ...")
    corpus = get_corpus()
    graph = {}
    for count, poem in enumerate(corpus, 1):
        # Progress report every 10000 poems.
        if count % 10000 == 0:
            print("[TextRank] Scanning %d/%d poems ..." %
                  (count, len(corpus)))
        for line in poem['sentence']:
            words = [w for w in segmenter.segment(line) if w not in stopwords]

            # Make sure every word has a node, even if it never gets an edge.
            for w in words:
                graph.setdefault(w, {})

            # Count each unordered pair of distinct words once, symmetrically.
            for pos, left in enumerate(words):
                for right in words[pos + 1:]:
                    if left == right:
                        continue
                    out_edges = graph[left]
                    out_edges[right] = out_edges.get(right, 0.0) + 1
                    back_edges = graph[right]
                    back_edges[left] = back_edges.get(left, 0.0) + 1

    # Normalise outgoing weights so they sum to one per word.
    for neighbours in graph.values():
        total = sum(neighbours.values())
        for other in neighbours:
            neighbours[other] /= total
    print("[TextRank] Weighted graph has been built.")
    _text_rank(graph)
Beispiel #3
0
class Renamer():
    """Suggest a readable, title-cased file name from a compact one.

    The base name is split on underscores, each part is broken into words by
    a ``Segmenter``, the words are title-cased, and the parts are joined with
    `` - ``.  A trailing ``<digits>e`` marker is rendered as an edition note.
    """

    # Words left as-is (not capitalised) unless they are the first word of a
    # segment.
    lower_list = [
        "a", "an", "the", "and", "but", "or", "for", "nor", "with", "to", "on",
        "as", "at", "by", "in", "of", "mid", "off", "per", "qua", "re", "up",
        "via", "o'", "'n'", "n'"
    ]

    def __init__(self, app, config=None):
        """Create the renamer and its word segmenter.

        Args:
            app: Application object; only ``app.log.debug`` and
                ``app.log.info`` are used here.
            config: Optional object exposing ``get(section, option)``.  The
                ``segmenter``/``ngrams_file`` option names a JSON file that
                is loaded and passed to ``Segmenter``.  When ``config`` is
                ``None``, the option is empty, or the file does not exist,
                the segmenter is built without custom ngrams.
        """
        self.app = app

        # Start from "no custom ngrams" so a missing file or missing config
        # cannot leave the name unbound.
        ngrams = None
        ngrams_file = (config.get('segmenter', 'ngrams_file')
                       if config is not None else None)
        if ngrams_file:
            self.app.log.debug(f'Trying to load ngrams file {ngrams_file}')
            try:
                with open(ngrams_file, 'r') as nf:
                    ngrams = json.load(nf)
            except FileNotFoundError:
                self.app.log.info(
                    f'Ngrams file {ngrams_file} not found. Using default configuration.'
                )
            else:
                self.app.log.debug(f'Loaded ngrams file {ngrams_file}.')

        # NOTE(review): assumes Segmenter() with no argument selects its
        # built-in defaults -- TODO confirm against the Segmenter definition.
        self._ws = Segmenter(ngrams) if ngrams is not None else Segmenter()

    def suggest_correction(self, filepath):
        """Return a suggested file name (with extension) for ``filepath``.

        Args:
            filepath: Path or bare file name; only the base name is used.

        Returns:
            The title-cased suggestion followed by the original extension.
            Underscore-separated parts are joined with `` - ``; empty parts
            (from leading, trailing or doubled underscores) are ignored.  If
            the stem ends in digits followed by ``e`` (e.g. ``2e``, ``10e``),
            that marker is removed and `` (<ordinal> edition)`` is appended.
        """
        filename = os.path.basename(filepath)
        filename, ext = os.path.splitext(filename)

        # Look for ending '2e', '3e' etc giving edition number.  The prefix
        # is matched lazily so that all trailing digits belong to the
        # edition rather than only the last one.
        edition_match = re.match(r'(.*?)(\d+)e$', filename)
        if edition_match:
            filename, edition = edition_match.groups()
        else:
            edition = ''

        result_segments = []
        # Process each segment individually
        for token in filename.split('_'):
            if not token:
                # Produced by leading/trailing/double underscores.
                continue
            words = self._ws.segment(token)
            if not words:
                continue
            # To title case: always capitalise the first word, and every
            # later word that is not in lower_list.
            titled = [words[0][:1].upper() + words[0][1:]]
            for word in words[1:]:
                if word in self.lower_list:
                    titled.append(word)
                else:
                    titled.append(word[:1].upper() + word[1:])
            result_segments.append(' '.join(titled))

        # Join suggestions for segments
        result = " - ".join(result_segments)
        # Add edition information
        if edition:
            # Pass a real integer rather than the matched string.
            result = result + ' ({} edition)'.format(
                num2words(int(edition), to='ordinal_num'))
        return result + ext