Example 1
def sent_search(params):
    (task_list, query_iid, related_sent, input_dir) = params

    nlp = spacy.load('en_core_web_lg', disable=['ner'])

    freq = dict()

    for ent in query_iid.keys():
        freq.update({ent: {'total': 0}})

    context = dict((ent, []) for ent in query_iid.keys())
    iid_set = set(related_sent.keys())

    subcorpus = []
    for fname in task_list:

        with open('{}/{}'.format(input_dir, fname), 'r') as f:
            for line in tqdm(f, desc='{}'.format(fname), mininterval=10):
                doc = json.loads(line)
                if doc['iid'] in iid_set:
                    subcorpus.append(doc)

    for item_dict in tqdm(subcorpus,
                          desc='enrich-{}'.format(len(subcorpus)),
                          mininterval=10):

        doc = nlp(item_dict['text'])
        unigram = [
            token.lemma_
            for token in textacy.extract.ngrams(doc,
                                                n=1,
                                                filter_nums=True,
                                                filter_punct=True,
                                                filter_stops=True,
                                                include_pos=["NOUN"])
        ]
        item_dict['unigram'] = unigram
        tokens = [token.lemma_ for token in doc]
        item_dict['tokens'] = [
            token.lemma_ for token in doc if not token.is_punct
        ]
        pos = [token.pos_ for token in doc]
        phrases = phrasemachine.get_phrases(tokens=tokens,
                                            postags=pos,
                                            minlen=2,
                                            maxlen=8)
        item_dict['phrases'] = list(phrases['counts'])

        for ent in related_sent[item_dict['iid']]:

            context[ent].append(item_dict)

            freq[ent]['total'] += 1
            if item_dict['did'] in freq[ent]:
                freq[ent][item_dict['did']] += 1
            else:
                freq[ent].update({item_dict['did']: 1})

    return {'context': context, 'freq': freq}
Example 2
def test_ark_tags():
    '''
    If the user has provided coarsened tags in the 5-tag system (e.g., they ran
    the ARK tagger), then phrasemachine should still work.
    '''
    phrases = pm.get_phrases(tokens=["red", "car"], postags=["A", "N"])
    assert "red car" in set(
        phrases["counts"].keys()
    ), "used to break before coarsened tags added to coarsemap in phrasemachine.py"
Example 3
def select_best(Question, Sentence, tagger):
    c = list(phrasemachine.get_phrases(Question)['counts'])
    Sentence2 = Sentence.lower()
    loc_s = [Sentence2.find(i.lower()) for i in c]
    loc_s = [i for i in loc_s if i >= 0]
    if len(loc_s) == 0:
        return tagger[0]
    loc = [Sentence2.find(i.lower()) for i in tagger]
    dist = [sum([abs(i - j) for j in loc_s]) for i in loc]
    return tagger[np.argmin(dist)]
Example 4
def preprocess(dataset):

    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(sentencizer, before="parser")

    input_data = "corpora/data_" + dataset + ".json"
    output_spacy = "corpora/data_" + dataset + ".spacy.jsonl"

    with open(input_data, "r") as inf:
        comments = json.load(inf)

    with open(output_spacy, "w") as of:

        docid = 0
        for ix, comment in enumerate(comments):

            ori_doc = comment['comment']
            doc = comment['comment']

            #remove punctuation
            doc = doc.translate(str.maketrans('', '', string.punctuation))

            # remove whitespaces
            doc = re.sub(r"\s+", " ", doc)

            # returns a token stream
            doc = nlp(doc)

            tokens = [token.text for token in doc]
            #tokens = [token.lemma_ for token in doc]
            pos = [token.pos_ for token in doc]

            #tok spans for sentences
            sentences = [(o.start, o.end) for o in doc.sents]

            # need minlen=1 here b/c bigger phrases are sparse
            phrases = phrasemachine.get_phrases(minlen=1,
                                                tokens=tokens,
                                                postags=pos)

            comment["phrases"] = list(phrases["counts"].keys())

            comment["tokens"] = tokens

            comment["sentences"] = sentences

            comment["presentation_text_full"] = ori_doc

            comment["presentation_text_short"] = str(list(doc.sents)[0])

            if " ".join(tokens).lower() != "need more information":
                if " ".join(tokens).lower() != "not enough information":
                    comment["docid"] = docid
                    docid += 1
                    of.write(json.dumps(comment) + "\n")
Example 5
def select_best_2(Question, Sentence, tagger):
    c = list(phrasemachine.get_phrases(Question)['counts'])
    c = c + headword(Question)
    Sentence2 = Sentence.lower()
    loc_s = [i.lower() for i in c if i.lower() in Sentence]
    if len(loc_s) == 0:
        return tagger[0]
    dist = [
        sum([distance_between_word(k, q, Sentence) for k in loc_s])
        for q in tagger
    ]
    return tagger[np.argmin(dist)]
Example 6
def test_basic_tagging():
    # Have to pick an example easy for the tagger
    pp = pm.get_phrases("Red stock market",
                        output=['pos', 'tokens', 'token_spans', 'counts'])
    assert pp['pos'] == "JJ NN NN".split(
    ), "this test failure may be due to tagger uncertainty... though unlikely..."
    assert set(pp['token_spans']) == set([(0, 2), (0, 3), (1, 3)])

    assert len(pp['counts']) == 3
    assert pp['counts']['red stock'] == 1
    assert pp['counts']['red stock market'] == 1
    assert pp['counts']['stock market'] == 1
Example 7
def merge_task(task_list, args):
    with open('{}/wiki_quality.txt'.format(args.entity_dir), 'r') as f:
        raw_list = f.read()

    nlp = spacy.load('en_core_web_lg', disable=['ner'])

    entityset = set(raw_list.split('\n'))

    tokenizer = MWETokenizer(separator=' ')

    for e in entityset:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    print("successfully read entity file and initialized tokenizer")
    sys.stdout.flush()

    for fname in task_list:
        outputname = 'SENTENCE_ENTITY_{}'.format(fname.split('_')[-1])
        context = []

        with open('{}/{}'.format(args.input_dir, fname), 'r') as f:
            lines = f.readlines()

        for item in tqdm(lines, desc='{}'.format(fname), mininterval=30):
            item_dict = json.loads(item)
            sent = nltk.word_tokenize(item_dict['text'])
            raw_tokenized = tokenizer.tokenize(sent)
            tokenized_set = set(raw_tokenized)
            mentioned_entity = list(tokenized_set.intersection(entityset))
            if len(mentioned_entity) != 0:
                doc = nlp(item_dict['text'])
                item_dict.update({'entityMentioned':mentioned_entity})
                unigram = [
                    token.text
                    for token in textacy.extract.ngrams(doc,
                                                        n=1,
                                                        filter_nums=True,
                                                        filter_punct=True,
                                                        filter_stops=True)
                ]
                item_dict['unigram'] = unigram
                tokens = [token.text for token in doc]
                pos = [token.pos_ for token in doc]
                phrases = phrasemachine.get_phrases(tokens=tokens, postags=pos)
                item_dict['phrases'] = list(phrases['counts'])
                context.append(json.dumps(item_dict))

        with open('{}/{}'.format(args.output_dir, outputname), "w+") as f:
            f.write('\n'.join(context))
Example 8
def text_to_legacy_token_output(text, **phrasemachine_opts):
    """
    Augment the text with phrases and return it AS TEXT (unicode object).
    Designed for things like Mallet which expect just a bunch of tokens.
    Recommendation: use option tagger='spacy'
    """

    text = unicodify(text)

    phrasecounts = phrasemachine.get_phrases(text,
                                             output=['counts', 'tokens'],
                                             **phrasemachine_opts)
    phrases = u" ".join(" ".join([w.replace(" ", "_")] * c)
                        for w, c in phrasecounts['counts'].most_common(999999))
    out = [text]
    out.append("\n")
    out.append(phrases)
    return u"\n".join(out)
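
A minimal usage sketch for the function above (not from the original source): it assumes the unicodify helper and the phrasemachine import from that module are in scope, and the sample sentence is invented for illustration.

example_text = "The stock market rallied while the bond market stayed flat."
# Prints the raw text, then each multi-word phrase repeated once per occurrence
# with internal spaces replaced by underscores (a Mallet-style token stream).
print(text_to_legacy_token_output(example_text, tagger='spacy'))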
Example 9
def wordartify(text):

    doc = nlp(text)
    artifacts = []

    for sent in doc.sents:
        tokens = [token.text for token in sent]
        pos = [token.pos_ for token in sent]
        res = phrasemachine.get_phrases(tokens=tokens,
                                        postags=pos,
                                        output="token_spans")
        spans = merge_spans(res['token_spans'])

        phrases = [tokens[lo:hi] for lo, hi in spans]
        html = to_html([token for token in sent], spans)

        artifacts.append(WordArt(html))

    return artifacts
Example 10
    def _phrases_in_raw_text_via_phrasemachine(self, raw_text):
        """
        Builds a list of phrases from raw text using phrasemachine.
        """
        # This returns a Dictionary of counts
        phrase_counts = phrasemachine.get_phrases(raw_text)['counts']

        phrases_in_document = []
        for unique_phrase in phrase_counts:
            # Fetch how many times this phrase occurred
            phrase_count = phrase_counts[unique_phrase]

            # Create N strings based on the count, since LDA will do the counts
            phrases_for_phrase_count = [unique_phrase] * phrase_count

            # Now that we have the phrase repeated, add them to the final
            # list of phrases.
            for phrase in phrases_for_phrase_count:
                phrases_in_document.append(phrase)

        return phrases_in_document
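
The same expansion can also be sketched as a standalone helper, assuming only that phrasemachine is installed (the function name below is hypothetical); repeating each phrase once per occurrence lets a bag-of-words model such as LDA recover the counts on its own.

import phrasemachine

def phrases_repeated_by_count(raw_text):
    # Counter mapping each phrase to its number of occurrences in the raw text.
    counts = phrasemachine.get_phrases(raw_text)['counts']
    # Repeat each phrase n times so a downstream model can re-derive the counts.
    return [phrase for phrase, n in counts.items() for _ in range(n)]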
Example 11
def extract_phrases():
    phrases = {}
    vocab = set()

    print "Reading files and extracting phrases"
    input_dir = dirs.data_raw_sentences_dir
    files = glob.glob(os.path.join(input_dir, '*'))
    for f_i, f in enumerate(files):
        if (f_i % 10) == 0 and f_i > 0:
            print(f_i)
        f_name = os.path.splitext(os.path.basename(f))[0]
        text = fh.read_text(f)[0]
        phrases[f_name] = phrasemachine.get_phrases(text)['counts']
        vocab.update(phrases[f_name].keys())

    vocab = list(vocab)
    vocab.sort()
    vocab_index = dict(zip(vocab, range(len(vocab))))
    n_phrases = len(vocab)

    items = sorted(phrases.keys())
    n_items = len(items)

    print "%d items, %d unique phrases" % (n_items, n_phrases)

    print "Building matrix"
    counts = lil_matrix((n_items, n_phrases))
    for f_i, f in enumerate(items):
        f_counts = list(phrases[f].values())
        f_phrases = list(phrases[f].keys())
        indices = [vocab_index[p] for p in f_phrases]
        counts[f_i, indices] = f_counts

    print "Saving files"
    output_dir = dirs.data_processed_phrasemachine_dir
    fh.save_sparse(counts, os.path.join(output_dir, 'phrases.npz'))
    fh.write_to_json({'index': items, 'vocab': vocab}, os.path.join(output_dir, 'phrases.json'), sort_keys=False)

    print "Done"
Example 12
def test_bad_counts_example_2():
    phrases = pm.get_phrases(
        "Social security is a law. Gravity is one too. Cheeseburgers are tasty. Social security is in a lockbox."
    )
    assert phrases['counts']['social security'] == 2
Example 13
def go(tags, **kwargs):
    pp = pm.get_phrases(postags=tags, output='token_spans', **kwargs)
    return pp['token_spans']
Example 14
            for w in l["words"]:
                if w["tag"] == "A":
                    of.write(','.join([w["word"], player, race, position]) + "\n")

if args.phrases:
    from phrasemachine import get_phrases
    with open(fn + ".{}.mentions.phrases.csv".format(args.K), "w") as of:
        for l in all_mentions:
            player = " ".join(l["metadata"]["player"])
            race = l["metadata"]["race"]
            position = l["metadata"]["position"]
            toks = [i["word"] for i in l["words"]]
            pos = [i["tag"] for i in l["words"]]
            assert len(toks) == len(pos)
            try:
                phrases = get_phrases(tokens=toks, postags=pos)
            except IndexError:
                phrases = {"counts":{}}
            phrases = [o for o in phrases["counts"].keys()]
            As = [i["word"] for i in l['words'] if i['tag'] == 'A']
            phrasetoks = [i for p in phrases for i in p.split(" ")]
            for w in phrases:
                of.write(",".join([w,player,race,position]) + '\n')
            for a in As:
                if a not in phrasetoks:
                    of.write(",".join([a,player,race,position]) + '\n')


'''

def get_mentions(K, fn):
Example 15
def procdoc(doc):
    phrasecounts = phrasemachine.get_phrases(doc['text'],
                                             output=['counts', 'tokens'],
                                             tagger='spacy')
    doc['tokens'] = phrasecounts['tokens']
    doc['phrase_counts'] = dict(phrasecounts['counts'])
Example 16
import phrasemachine
import re,sys,os
# import chardet

outdir = sys.argv[1]
files = sys.argv[2:]
os.system("mkdir -p %s" % outdir)
print "OUTPUT TO",outdir

d = 10
for i in range(0,len(files),d):
    batch = files[i:i+d]
    print "BATCH",i,batch
    alltext = "\n\n".join(open(f).read() for f in batch)
    alltext = alltext.decode("utf-8","ignore")
    # phrases="";words=""
    phrasecounts = phrasemachine.get_phrases(alltext, tagger='spacy', output=['counts','tokens'])
    phrases = u" ".join(" ".join([w.replace(" ","_")]*c) for w,c in phrasecounts['counts'].most_common(999999))
    words = u" ".join(phrasecounts['tokens'])

    with open("%s/batch%04d.txt" % (outdir, i), 'w', encoding="utf-8") as out:
        print(phrases.lower(), words.lower(), file=out)
Example 17
# Output phrase instance extractions in a format intended to be diffable
import sys
sys.path.insert(0, "../py")
import phrasemachine

for filename in sys.argv[1:]:
    print "\n=== FILE", filename
    text = open(filename).read().decode("utf-8", 'replace')
    pp = phrasemachine.get_phrases(text,
                                   output=['token_spans', 'tokens', 'pos'])
    spans = pp['token_spans']
    spans.sort()  ## (s,e) in lexicographic order
    for s, e in spans:
        phrase = u" ".join(pp['tokens'][s:e]).lower()
        tagstr = " ".join(pp['pos'][s:e])
        out = u"%d %d\t%s\t%s" % (s, e, phrase, tagstr)
        print(out)
Example 18
def test_multisentence():
    pp = pm.get_phrases("blue table. blue table. blue table.")
    print(pp)
    assert len(
        pp['counts']
    ) == 1  ## should be just 'blue table'.  if buggy, it can pick up spans across sentences
Example 19
def test_custom_regex():
    out = pm.get_phrases(tokens=["the", "red", "car"],
                         postags=["D", "A", "N"],
                         regex='DA')
    assert "the red" in set(out["counts"].keys()), "custom regex should work"
Example 20
import phrasemachine
import json
import os

# set wd
os.chdir(
    '/Users/matthewjdenny/Documents/Research/Congressional_Bill_Language/EMNLP_2016/phrasemachine/testdata/sotu'
)

# read in example data
infile = open("1985.txt")
text = infile.read()
infile.close()

# get phrases
phrases = phrasemachine.get_phrases(text, include_unigrams=True)

# write to json
os.chdir(
    '/Users/matthewjdenny/Documents/Research/Congressional_Bill_Language/EMNLP_2016/phrasemachine/R/comparison_tests'
)

with open('python_phrase_extractions.json', 'w') as outfile:
    json.dump(phrases, outfile)

# output POS tags
pos_tags = phrasemachine.get_phrases(text, output='pos')

with open('python_pos_tags.json', 'w') as outfile:
    json.dump(pos_tags, outfile)
Example 21
def sent_search(params):
    (task_list, args) = params

    query = args.query_string.split(',')

    nlp = spacy.load('en_core_web_lg', disable=['ner'])

    freq = dict()

    for ent in query:
        freq.update({ent: {'total': 0}})

    context = dict((ent, []) for ent in query)

    for fname in task_list:

        with open('{}/{}'.format(args.input_dir, fname), 'r') as f:
            lines = f.readlines()

        for item in tqdm(lines, desc='{}'.format(fname), mininterval=10):
            try:
                item_dict = json.loads(item)
            except json.JSONDecodeError:
                print(fname, item)
                sys.stdout.flush()
                continue

            entity_text = set(item_dict['entityMentioned'])

            for ent in query:
                if ent not in entity_text:
                    continue
                else:
                    doc = nlp(item_dict['text'])
                    if len(doc) >= 30 or item_dict['nsubj'] == [] or set(
                            item_dict['nsubj']).intersection(pronoun) != set():
                        continue
                    unigram = [
                        token.text
                        for token in textacy.extract.ngrams(doc,
                                                            n=1,
                                                            filter_nums=True,
                                                            filter_punct=True,
                                                            filter_stops=True)
                    ]
                    item_dict['unigram'] = unigram
                    tokens = [token.text for token in doc]
                    pos = [token.pos_ for token in doc]
                    phrases = phrasemachine.get_phrases(tokens=tokens,
                                                        postags=pos,
                                                        minlen=2,
                                                        maxlen=8)
                    item_dict['phrases'] = list(phrases['counts'])
                    context[ent].append(item_dict)

                    freq[ent]['total'] += 1
                    if item_dict['did'] in freq[ent]:
                        freq[ent][item_dict['did']] += 1
                    else:
                        freq[ent].update({item_dict['did']: 1})
    return {'context': context, 'freq': freq}
Example 22
def main_thrd(query_set, args, iindex):
    start_time = time.time()
    nlp = spacy.load('en_core_web_lg', disable=['ner'])
    
    unique_ent = set()

    for item in query_set:
        target = item['target']
        queries = item['queries']
        for query in queries:
            unique_ent = unique_ent.union(set(query))

    # ##### sentence search #####
    query_iid = {}
    related_sent = defaultdict(list)
    for ent in tqdm(unique_ent, desc='loading-entity', mininterval=10):
        mentions = set(iindex[ent])
        query_iid.update({ent:mentions})

    for k, v in tqdm(query_iid.items(), desc='related-sents', mininterval=10):
        for iid in v:
            related_sent[iid].append(k)

    input_files = os.listdir(args.input_dir)
    tasks = list(split(input_files, args.num_process))

    inputs = [(tasks[i], query_iid, related_sent, args.input_dir) for i in range(args.num_process)]

    with Pool(args.num_process) as p:
        search_results = p.map(sent_search, inputs)

    search_merge = search_results[0]['context']
    count_merge = search_results[0]['freq']

    for pid in range(1, len(search_results)):
        tmp_context = search_results[pid]['context']
        tmp_freq = search_results[pid]['freq']
        for ent in unique_ent:
            search_merge[ent] += tmp_context[ent]
            count_merge[ent]['total'] += tmp_freq[ent]['total']
            tmp_freq[ent].pop('total', None)
            count_merge[ent].update(tmp_freq[ent])

    for ent in unique_ent:
        for index in range(len(search_merge[ent])):
            search_merge[ent][index]['doc_score'] = count_merge[ent][search_merge[ent][index]['did']]/count_merge[ent]['total']

    print("--- search use %s seconds ---" % (time.time() - start_time))
    sys.stdout.flush()

    ### query processing ###
    num_query = args.num_query
    query_length = args.query_length
    eval_metric = {}
    bar = 1

    for item in query_set:
        top1_score = 0
        top5_score = 0
        top10_score = 0
        recall = 0
        norm_score = 0
        index = 0
        target = item['target']
        queries = item['queries']
        print('processing set: ', target)
        sys.stdout.flush()
        
        for query in queries:

            print('processing query: ', query)
            sys.stdout.flush()

            unigrams = []
            for ent in query:
                for sent in search_merge[ent]:
                    unigrams += sent['unigram']
            unigram_set = set(unigrams)

            print('(1/3) generate unigrams')
            sys.stdout.flush()

            N = 0
            cnt = Counter()
            for ent in query:
                N += len(search_merge[ent])
                for sent in search_merge[ent]:
                    cnt.update(sent['tokens'])
            cnt = dict(cnt)

            for ent in query:
                for word in nltk.word_tokenize(ent):
                    unigram_set.discard(word)
                    unigram_set.discard(LEMMA.lemmatize(word))

            idf = {}
            for key in cnt.keys():
                idf.update({key:np.log((N / cnt[key]))})

            print('(2/3) compute idf')
            sys.stdout.flush()

            context = ''
            for ent in query:
                context += ' '.join([sent['text'] for sent in search_merge[ent]])

            phrases = phrasemachine.get_phrases(context, minlen=2, maxlen=4)
            list_phrases = list(phrases['counts'])[:15]

            idf_list = [*idf]
            target_doc = nlp(target)
            target_token = [token.lemma_ for token in target_doc if not token.is_punct]

            params = (list_phrases, unigram_set, target_token, idf, 0)
            phrases_sorted = phrase_eval(params)

            print(phrases_sorted)
            print('(3/3) evaluate phrases')
            sys.stdout.flush()

            if phrases_sorted == []:
                continue

            top10 = [lab[0] for lab in phrases_sorted[:10]]
            best_phrase = phrases_sorted[0][0]
            best_sim = phrases_sorted[0][1]
            top5_sim = max([lab[1] for lab in phrases_sorted[:5]])
            top10_sim = max([lab[1] for lab in phrases_sorted[:10]])
            recall_rank = int(np.argmax([lab[1] for lab in phrases_sorted]))
            recall_phrase = phrases_sorted[recall_rank][0]
            recall_sim = phrases_sorted[recall_rank][1]
            norm_best_sim = best_sim / recall_sim if recall_sim != 0 else 0
            recall += recall_sim
            top1_score += best_sim
            top5_score += top5_sim
            top10_score += top10_sim
            norm_score += norm_best_sim
            meta = {'query':query, 'target': target, 'top10': top10, 'sim@1':best_sim, 'sim@5': top5_sim, 'sim@10': top10_sim, 'sim@full':(recall_phrase, recall_rank+1, recall_sim), 'norm_sim@1': norm_best_sim}
            print(meta)
            sys.stdout.flush()
            with open('{}/log-{}-{}.txt'.format(args.output_dir, query_length, args.sampling_method), 'a+') as f:
                f.write(json.dumps(meta) + '\n')
        
        top1_score /= num_query
        top5_score /= num_query
        top10_score /= num_query
        recall /= num_query
        norm_score /= num_query
        eval_metric.update({target:{'sim@1': top1_score, 'sim@5': top5_score, 'sim@10': top10_score, 'sim@full': recall, 'norm_sim@1': norm_score}})
        with open('{}/tfidf-sim-{}-{}.txt'.format(args.output_dir, query_length, args.sampling_method), 'a+') as f:
            f.write(json.dumps(eval_metric) + '\n')
        
        print('---- progress in {}/{} ----'.format(bar, len(query_set)))
        bar += 1
        sys.stdout.flush()
Example 23
Our mission is to nourish and defend freedom and democracy, and to communicate these ideals everywhere we can. America's economic success is freedom's success; it can be repeated a hundred times in a hundred different nations. Many countries in east Asia and the Pacific have few resources other than the enterprise of their own people. But through low tax rates and free markets they've soared ahead of centralized economies. And now China is opening up its economy to meet its needs. 
We need a stronger and simpler approach to the process of making and implementing trade policy, and we'll be studying potential changes in that process in the next few weeks. We've seen the benefits of free trade and lived through the disasters of protectionism. Tonight I ask all our trading partners, developed and developing alike, to join us in a new round of trade negotiations to expand trade and competition and strengthen the global economy—and to begin it in this next year. 
There are more than 3 billion human beings living in Third World countries with an average per capita income of $650 a year. Many are victims of dictatorships that impoverished them with taxation and corruption. Let us ask our allies to join us in a practical program of trade and assistance that fosters economic development through personal incentives to help these people climb from poverty on their own. 
We cannot play innocents abroad in a world that's not innocent; nor can we be passive when freedom is under siege. Without resources, diplomacy cannot succeed. Our security assistance programs help friendly governments defend themselves and give them confidence to work for peace. And I hope that you in the Congress will understand that, dollar for dollar, security assistance contributes as much to global security as our own defense budget. 
We must stand by all our democratic allies. And we must not break faith with those who are risking their lives—on every continent, from Afghanistan to Nicaragua—to defy Soviet-supported aggression and secure rights which have been ours from birth. 
The Sandinista dictatorship of Nicaragua, with full Cuban-Soviet bloc support, not only persecutes its people, the church, and denies a free press, but arms and provides bases for Communist terrorists attacking neighboring states. Support for freedom fighters is self-defense and totally consistent with the OAS and U.N. Charters. It is essential that the Congress continue all facets of our assistance to Central America. I want to work with you to support the democratic forces whose struggle is tied to our own security. 
And tonight, I've spoken of great plans and great dreams. They're dreams we can make come true. Two hundred years of American history should have taught us that nothing is impossible. 
Ten years ago a young girl left Vietnam with her family, part of the exodus that followed the fall of Saigon. They came to the United States with no possessions and not knowing a word of English. Ten years ago—the young girl studied hard, learned English, and finished high school in the top of her class. And this May, May 22d to be exact, is a big date on her calendar. Just 10 years from the time she left Vietnam, she will graduate from the United States Military Academy at West Point. I thought you might like to meet an American hero named Jean Nguyen. 
Now, there's someone else here tonight, born 79 years ago. She lives in the inner city, where she cares for infants born of mothers who are heroin addicts. The children, born in withdrawal, are sometimes even dropped on her doorstep. She helps them with love. Go to her house some night, and maybe you'll see her silhouette against the window as she walks the floor talking softly, soothing a child in her arms-Mother Hale of Harlem, and she, too, is an American hero. 
Jean, Mother Hale, your lives tell us that the oldest American saying is new again: Anything is possible in America if we have the faith, the will, and the heart. History is asking us once again to be a force for good in the world. Let us begin in unity, with justice, and love.
Thank you, and God bless you.

"""

import phrasemachine
phrases = phrasemachine.get_phrases(text)
print("%s phrase types" % len(phrases['counts']))
print("%s phrase hits" % sum(phrases['counts'].values()))
print("Top phrases:")
print(phrases['counts'].most_common(10))

print("From crappy tokenization:")
crappy_tokens = text.split()
print(phrasemachine.get_phrases(tokens=crappy_tokens)['counts'].most_common(10))

print("Phrase spans")
phrases = phrasemachine.get_phrases(text, output=['token_spans','tokens'])
print("%s phrase hits" % len(phrases['token_spans']))
print(phrases['token_spans'][:20])
print(phrases['token_spans'][-20:])
Example 24
                for e in doc.ents:
                    if e.label_ == 'PERSON':
                        newtext = newtext.replace(e.text, '$$$')
                    elif e.label_ == 'DATE':
                        newtext = newtext.replace(e.text, '$$$$')
                    elif e.label_ in skip_ents:
                        newtext = newtext.replace(e.text, '')
            newtext = newtext.replace('  ', ' ')
            newtext = newtext.replace('..', '.')
            newtext = newtext.lower()
            # newtext = newtext.rstrip(' ')
            with open(os.path.join('./Combined_modified_3', file), 'w') as f2:
                f2.write(newtext)
            # newsent = newtext.split(".")
            candidate = phrasemachine.get_phrases(newtext)
            # for s in newsent:
            #     doc = nlp(s)
            #     for token in doc.noun_chunks:
            #         candidate.append(token.text)
            #     for ent in doc.ents:
            #         candidate.append(ent.text)
            # candidate = set(candidate)
            with open(os.path.join('./Combined_candidate_3', file), 'w') as f2:
                for c in candidate['counts']:
                    # print(c)
                    f2.write(c)
                    f2.write(",")
            print(file, "No error")