import codecs
from functools import reduce

# Segmenter, get_word_ranks and get_corpus are the project's own helpers.


def prepare_training_data():
    """Write (sentence, keyword) pairs for every quatrain whose four
    sentences each contain at least one ranked word."""
    word_ranks = get_word_ranks()
    corpus = get_corpus()
    segmenter = Segmenter()
    with codecs.open('./data/training.txt', 'w', 'utf-8') as fout:
        for poem in corpus:
            poem['keyword'] = []
            # Only quatrains (exactly four sentences) are used.
            if len(poem['sentence']) != 4:
                continue
            # Segment each sentence once, keeping only ranked words.
            seg_lists = [[seg for seg in segmenter.segment(sentence)
                          if seg in word_ranks]
                         for sentence in poem['sentence']]
            # Skip poems in which some sentence has no ranked segment.
            if any(not segs for segs in seg_lists):
                continue
            for sentence, segs in zip(poem['sentence'], seg_lists):
                # The highest-ranked segment becomes the sentence's keyword.
                keyword = reduce(
                    lambda x, y: x if word_ranks[x] > word_ranks[y] else y,
                    segs)
                poem['keyword'].append(keyword)
                fout.write(sentence + '\t' + keyword + '\n')
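# A minimal sketch of reading the emitted tab-separated pairs back in;
# load_training_pairs is a hypothetical helper, not part of the project:
def load_training_pairs(path='./data/training.txt'):
    with codecs.open(path, 'r', 'utf-8') as fin:
        return [tuple(line.rstrip('\n').split('\t'))
                for line in fin if line.strip()]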
def get_text_ranks():
    """Build a word co-occurrence graph over the corpus and run TextRank."""
    segmenter = Segmenter()
    stopwords = get_stopwords()
    print("Start TextRank over the selected quatrains ...")
    corpus = get_corpus()
    adjlist = dict()
    for idx, poem in enumerate(corpus):
        if (idx + 1) % 10000 == 0:
            print("[TextRank] Scanning %d/%d poems ..."
                  % (idx + 1, len(corpus)))
        for sentence in poem['sentence']:
            segs = [word for word in segmenter.segment(sentence)
                    if word not in stopwords]
            for seg in segs:
                if seg not in adjlist:
                    adjlist[seg] = dict()
            # Words co-occurring in the same sentence are linked both ways;
            # every co-occurrence adds 1 to the edge weight.
            for i, seg in enumerate(segs):
                for other in segs[i + 1:]:
                    if seg != other:
                        adjlist[seg][other] = adjlist[seg].get(other, 0.0) + 1.0
                        adjlist[other][seg] = adjlist[other].get(seg, 0.0) + 1.0
    # Normalize outgoing edge weights so they sum to 1 for each word.
    for word in adjlist:
        w_sum = sum(adjlist[word].values())
        for other in adjlist[word]:
            adjlist[word][other] /= w_sum
    print("[TextRank] Weighted graph has been built.")
    _text_rank(adjlist)
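# _text_rank() itself is not shown above. A minimal power-iteration sketch,
# assuming a damping factor of 0.85 and the row-normalized adjlist built by
# get_text_ranks(); the project's real implementation may differ in its
# convergence test and in how it persists the resulting word ranks:
def _text_rank(adjlist, damp=0.85, max_iters=100, tol=1e-6):
    scores = {word: 1.0 for word in adjlist}
    for _ in range(max_iters):
        new_scores = dict()
        for word in adjlist:
            # Score flows in from each neighbor, weighted by that
            # neighbor's normalized edge towards this word.
            rank = sum(scores[other] * adjlist[other][word]
                       for other in adjlist[word])
            new_scores[word] = (1.0 - damp) + damp * rank
        delta = max(abs(new_scores[word] - scores[word])
                    for word in adjlist)
        scores = new_scores
        if delta < tol:
            break
    return scores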
import json
import os
import re

from num2words import num2words

# Segmenter comes from the project's segmentation module.


class Renamer:
    # Words kept lower-case when title-casing filenames.
    lower_list = [
        "a", "an", "the", "and", "but", "or", "for", "nor", "with", "to",
        "on", "as", "at", "by", "in", "of", "mid", "off", "per", "qua",
        "re", "up", "via", "o'", "'n'", "n'"
    ]

    def __init__(self, app, config=None):
        self.app = app
        # Fall back to the segmenter's default model if no ngrams file
        # can be loaded.
        ngrams = None
        ngrams_file = config.get('segmenter', 'ngrams_file')
        self.app.log.debug(f'Trying to load ngrams file {ngrams_file}')
        try:
            with open(ngrams_file, 'r') as nf:
                ngrams = json.load(nf)
            self.app.log.debug(f'Loaded ngrams file {ngrams_file}.')
        except FileNotFoundError:
            self.app.log.info(
                f'Ngrams file {ngrams_file} not found. '
                'Using default configuration.')
        self._ws = Segmenter(ngrams)

    def suggest_correction(self, filepath):
        filename = os.path.basename(filepath)
        filename, ext = os.path.splitext(filename)
        # Look for an ending '2e', '3e', etc. giving the edition number.
        edition_match = re.match(r'(.*)(\d)e$', filename)
        filename, edition = (edition_match.groups() if edition_match
                             else (filename, ''))
        result_segments = []
        # Process each underscore-separated segment individually.
        for token in filename.split('_'):
            words = self._ws.segment(token)
            # Title case: always capitalize the first word; capitalize the
            # rest unless they appear in lower_list.
            words = [words[0][:1].upper() + words[0][1:]] + [
                (word[:1].upper() if word not in self.lower_list
                 else word[:1]) + word[1:]
                for word in words[1:]
            ]
            result_segments.append(' '.join(words))
        # Join the suggestions for the segments.
        result = " - ".join(result_segments)
        # Append the edition information, if present.
        if edition:
            result = result + ' ({} edition)'.format(
                num2words(int(edition), to='ordinal_num'))
        return result + ext
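# A hypothetical usage sketch; the 'app' object, the config contents and the
# exact segmentation output all depend on the surrounding application:
#
#   renamer = Renamer(app, config=app.config)
#   renamer.suggest_correction('/scans/onthenatureofthings2e.pdf')
#   # -> 'On the Nature of Things (2nd edition).pdf'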