def test_default(spacy_doc):
    result = ke.textrank(spacy_doc)
    assert isinstance(result, list) and len(result) > 0
    assert all(isinstance(ts, tuple) and len(ts) == 2 for ts in result)
    assert all(
        isinstance(ts[0], compat.unicode_) and isinstance(ts[1], float)
        for ts in result
    )


def test_position_bias(spacy_doc):
    result1 = ke.textrank(spacy_doc, position_bias=False)
    result2 = ke.textrank(spacy_doc, position_bias=True)
    assert len(result1) > 0 and len(result2) > 0
    assert result1 != result2


def test_edge_weighting(spacy_doc):
    result1 = ke.textrank(spacy_doc, edge_weighting="binary")
    result2 = ke.textrank(spacy_doc, edge_weighting="count")
    assert len(result1) > 0 and len(result2) > 0
    assert result1 != result2


def test_window_size(spacy_doc):
    result1 = ke.textrank(spacy_doc, window_size=2)
    result2 = ke.textrank(spacy_doc, window_size=4)
    assert len(result1) > 0 and len(result2) > 0
    assert result1 != result2


def test_topn_float(spacy_doc):
    result = ke.textrank(spacy_doc, topn=0.2)
    assert len(result) > 0
    with pytest.raises(ValueError):
        _ = ke.textrank(spacy_doc, topn=2.0)


def test_n_topn(spacy_doc):
    for n in (5, 25):
        result = ke.textrank(spacy_doc, topn=n)
        assert 0 < len(result) <= n
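# The tests above request a `spacy_doc` pytest fixture that is not shown here.
# A minimal conftest.py sketch of such a fixture, assuming textacy and the
# `en_core_web_sm` spaCy model are installed; the sample text and fixture
# scope are illustrative, not the project's actual fixture.
import pytest
import textacy


@pytest.fixture(scope="module")
def spacy_doc():
    text = (
        "Graph-based keyterm extraction methods such as TextRank build a "
        "word co-occurrence network over a document and score candidate "
        "phrases by the centrality of their component words."
    )
    # Build a spaCy Doc wrapped by textacy, as the snippets below also do.
    return textacy.make_spacy_doc(text, lang="en_core_web_sm")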
    return ax


# open data from .txt file
with open('news_article.txt', 'r') as file:
    data = file.read().replace('\n', '')
article = data.replace(u'\xa0', u' ')

# create doc object
doc = textacy.make_spacy_doc(article, lang='en_core_web_sm')

# KEYTERM EXTRACTION
# Each algorithm returns a list of tuples containing the keyterm and a score
textrank = ke.textrank(doc, normalize="lemma")
yake = ke.yake(doc, normalize="lemma")
scake = ke.scake(doc, normalize="lemma")
sgrank = ke.sgrank(doc, normalize="lemma")

# separate terms and relevancy scores
terms_textrank, scores_textrank = decompose_keyterms(textrank)
terms_yake, scores_yake = decompose_keyterms(yake)
terms_scake, scores_scake = decompose_keyterms(scake)
terms_sgrank, scores_sgrank = decompose_keyterms(sgrank)

# save results to dataframe
df = keyterm_dataframe(scake, 'scake')
print(df)
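# `decompose_keyterms` and `keyterm_dataframe` are not defined in this
# snippet. Below is a minimal sketch of what they are assumed to do, inferred
# only from how they are called above; the actual helpers may differ.
import pandas as pd


def decompose_keyterms(keyterms):
    """Split a list of (term, score) tuples into parallel lists."""
    terms = [term for term, score in keyterms]
    scores = [score for term, score in keyterms]
    return terms, scores


def keyterm_dataframe(keyterms, algorithm_name):
    """Collect one algorithm's (term, score) tuples into a labelled DataFrame."""
    df = pd.DataFrame(keyterms, columns=['keyterm', 'score'])
    df['algorithm'] = algorithm_name
    return df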
def main():
    # We pass these dynamic arguments in for parallel jobs
    parser = argparse.ArgumentParser(
        description='Build a phrase dictionary using spaCy and TextaCy')
    parser.add_argument('--source-tmx', type=str, default='',
                        help='The input tmx file to process')
    parser.add_argument('--dictionary-path', type=str, default='',
                        help='The input/output path for the phrase dictionary')
    parser.add_argument('--target-language', type=str, default='',
                        help='es or fr')
    parser.add_argument(
        '--category-id', type=str, default='',
        help='The model against which to build the phrase dictionary')
    parser.add_argument(
        '--nlp-id', type=str, default='',
        help='The source language spacy model e.g. en_core_web_md')
    parser.add_argument(
        '--nlp-target', type=str, default='',
        help='The target language spacy model e.g. fr_core_news_md')
    parser.add_argument('--batch-start', type=int, default=0, metavar='N',
                        help='start at this number + batch-size')
    parser.add_argument('--batch-end', type=int, default=100, metavar='N',
                        help='end at this number')
    args = parser.parse_args()

    set_log_level(Config.DEBUG)
    nlp_model_id = load_spacy_model(args.nlp_id)
    nlp_model_target = load_spacy_model(args.nlp_target)
    logging.debug(f"Loaded models {args.nlp_id} {args.nlp_target}")

    phrases = {}
    tmx_file = load_tmx_file(args.source_tmx)
    logging.debug(f"Loaded {args.source_tmx}")

    if os.path.isfile(
            os.path.join(args.dictionary_path,
                         args.target_language + '_phrase_dictionary.txt')):
        phrase_file_name = str(
            os.path.join(args.dictionary_path,
                         args.target_language + '_phrase_dictionary.txt'))
        phrase_file = load_phrase_dictionary(phrase_file_name, 'a')
        logging.debug(f"Found existing phrase dictionary {phrase_file_name}")
        phrase_list = [line.rstrip('\n') for line in open(phrase_file_name)]
        for line in phrase_list:
            lst_line = line.split(',')
            if len(lst_line[0]) > 0:
                phrases[lst_line[0]] = lst_line[1]
    else:
        phrase_file = load_phrase_dictionary(
            os.path.join(args.dictionary_path,
                         args.target_language + '_phrase_dictionary.txt'), 'a')

    for i, unit in enumerate(tmx_file.getunits()):
        if i < args.batch_start:
            continue
        if i > args.batch_end:
            break
        logging.info(
            f"Processing record {i} of {len(tmx_file.units)} "
            f"(Batch start {args.batch_start} Batch end {args.batch_end})")

        nlp_id = nlp_model_id(unit.getid())
        nlp_target = nlp_model_target(unit.gettarget())

        res_id = ke.textrank(nlp_id, normalize='lemma',
                             include_pos=('NOUN', 'PROPN', 'ADJ', 'VERB'),
                             window_size=5, edge_weighting='binary',
                             position_bias=False, topn=5)
        res_target = ke.textrank(nlp_target, normalize='lemma',
                                 include_pos=('NOUN', 'PROPN', 'ADJ', 'VERB'),
                                 window_size=5, edge_weighting='binary',
                                 position_bias=False, topn=5)

        for r_id in res_id:
            # We don't want single words, we want phrases
            if (len(r_id[0].split()) > 1) and (len(res_target) > 0):
                if r_id[0] not in phrases:
                    translation_results = call_translation(
                        [{'Text': r_id[0]}], args.target_language,
                        args.category_id, Config.SUBSCRIPTION_KEY,
                        Config.REGION)
                    if len(translation_results[0]['translations'][0]) > 0:
                        for r_tar in res_target:
                            if len(r_tar[0].split()) > 1:
                                bleu_score = 0
                                # Let's only take exact matches
                                if (r_tar[0].lower().strip() ==
                                        translation_results[0]['translations'][0]['text'].lower().strip()):
                                    # We use absolute matches but keep this here for BLEU if needed
                                    bleu_score = 1
                                    print(f"Found {r_id[0]} : {r_tar[0].strip()}")
                                    phrases[r_id[0]] = r_tar[0].strip()
                                    # As the phrase dictionary is case sensitive, let's include a few variations
                                    phrase_file.write('\n' + r_id[0] + ', ' + r_tar[0].strip())
                                    # TODO add BLEU evaluation if needed
    phrase_file.close()
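# Hypothetical entry point and invocation sketch, not part of the original
# snippet. main() relies on project-level helpers (load_spacy_model,
# load_tmx_file, load_phrase_dictionary, call_translation, set_log_level,
# Config) defined elsewhere, and is driven entirely by the argparse flags
# declared above; the script name and example paths below are illustrative.
#
#   python build_phrase_dictionary.py \
#       --source-tmx data/en_fr.tmx \
#       --dictionary-path ./dictionaries \
#       --target-language fr \
#       --category-id <custom-translator-category-id> \
#       --nlp-id en_core_web_md \
#       --nlp-target fr_core_news_md \
#       --batch-start 0 --batch-end 100
if __name__ == '__main__':
    main()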