Example No. 1
def count_freqs(input_loc, output_loc):
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(vocab,
                    path.join(English.default_data_dir(), 'tokenizer'))

    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)

    with io.open(output_loc, 'w', 'utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
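The snippet above assumes an `iter_comments` helper that is not shown. A minimal sketch of what it might look like, assuming the input is a bz2-compressed dump with one JSON comment object per line (both the helper body and the file format are assumptions, not part of the snippet):

import bz2
import json


def iter_comments(loc):
    # Assumed helper: yield one parsed JSON comment per line of a bz2-compressed dump.
    with bz2.BZ2File(loc) as file_:
        for line in file_:
            yield json.loads(line.decode('utf8'))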
Example No. 2
class SpacyParser(object):
    '''https://spacy.io/#example-use'''
    def __init__(self, num_threads=4):
        
        self.nlp = English(tokenizer=True, parser=True, tagger=True,
                           entity=None, matcher=None)
    
    def parse(self, doc, doc_id=None):
        """Parse a raw document as a string into a list of sentences"""
        if len(doc.strip()) == 0:
            return
        doc = doc.decode("utf-8")
        for doc in self.nlp.pipe([doc], batch_size=50, n_threads=4):
            assert doc.is_parsed
                    
        for sent_id, sent in enumerate(doc.sents):
            tokens = [t for t in sent]
            token_idxs = [t.idx for t in sent]
            words = [t.text for t in sent]
            lemmas = [self.nlp.vocab.strings[t.lemma] for t in tokens]
            poses = [self.nlp.vocab.strings[t.tag] for t in tokens]
            dep_labels = [self.nlp.vocab.strings[t.dep] for t in tokens]
            # index tokens to determine sentence offset for dependency tree
            token_idx = {t:i for i,t in enumerate(tokens)}
            dep_parents = [token_idx[t.head] for t in tokens] 
            
            s = Sentence(words=words,lemmas=lemmas,poses=poses, 
                         dep_parents=dep_parents, dep_labels=dep_labels, 
                         sent_id=sent_id, doc_id=doc_id, text=sent.text,
                         token_idxs=token_idxs, doc_name=doc_id )

            yield s
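The `Sentence` container that `parse` yields is defined elsewhere in the project. A minimal stand-in, with field names inferred from the keyword arguments above (a sketch, not the project's actual class):

from collections import namedtuple

# Lightweight stand-in; the real class may carry additional behaviour.
Sentence = namedtuple('Sentence', [
    'words', 'lemmas', 'poses', 'dep_parents', 'dep_labels',
    'sent_id', 'doc_id', 'text', 'token_idxs', 'doc_name'
])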
Example No. 3
    def test_thinc_load(self):
        data_dir = English.default_data_dir()
        model_loc = path.join(data_dir, 'deps', 'model')

        # n classes. moves.n_moves above
        # n features. len(templates) + 1 above
        model = LinearModel(92, 116)
        model.load(model_loc)
Example No. 4
def vocab():
    vocab = English.default_vocab()
    lex = vocab['dog']
    assert vocab[vocab.strings['dog']].orth_ == 'dog'
    lex  = vocab['the']
    lex = vocab['quick']
    lex = vocab['jumped']
    return vocab
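A minimal pytest-style usage sketch, assuming the function above is registered with `@pytest.fixture` (the test name below is ours, not part of the original suite):

def test_orth_id_roundtrip(vocab):
    # A lexeme's integer orth ID should round-trip through the string store.
    lex = vocab['dog']
    assert lex.orth == vocab.strings['dog']
    assert vocab.strings[lex.orth] == 'dog'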
Example No. 5
    def test_load_careful(self):
        config_data = {"labels": {"0": {"": True}, "1": {"": True}, "2": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "dobj": True, "neg": True, "csubjpass": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "relcl": True, "quantmod": True, "acomp": True, "compound": True, "pcomp": True, "intj": True, "poss": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "amod": True, "dative": True, "pobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True, "acl": True}, "3": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "acl": True, "poss": True, "neg": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "amod": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "quantmod": True, "acomp": True, "pcomp": True, "intj": True, "relcl": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "dobj": True, "dative": True, "pobj": True, "iobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True}, "4": {"ROOT": True}}, "seed": 0, "features": "basic", "beam_width": 1}

        data_dir = English.default_data_dir()
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))

        moves = ArcEager(vocab.strings, config_data['labels'])
        templates = get_templates(config_data['features'])

        model = Model(moves.n_moves, templates, path.join(data_dir, 'deps'))

        parser = Parser(vocab.strings, moves, model)
Example No. 6
def vocab():
    data_dir = os.environ.get('SPACY_DATA')
    if data_dir is None:
        package = util.get_package_by_name('en')
    else:
        package = util.get_package(data_dir)

    vocab = English.default_vocab(package=package)
    lex = vocab['dog']
    assert vocab[vocab.strings['dog']].orth_ == 'dog'
    lex  = vocab['the']
    lex = vocab['quick']
    lex = vocab['jumped']
    return vocab
Example No. 7
def main():
    nlp = English(parser=False, tagger=False, entity=False)

    gazetteer = [u'M.I.A.', u'Shiny Happy People', u'James E. Jones']
    example_text = u'The artist M.I.A. did a cover of Shiny Happy People. People is not an entity.'
    pattern_ids = PreshMap()
    max_length = 0
    for pattern_str in gazetteer:
        pattern = nlp.tokenizer(pattern_str)
        bilou_tags = get_bilou(len(pattern))
        for word, tag in zip(pattern, bilou_tags):
            lexeme = nlp.vocab[word.orth]
            lexeme.set_flag(tag, True)
        pattern_ids[hash_string(pattern.text)] = True
        max_length = max(max_length, len(pattern))

    matcher = make_matcher(nlp.vocab, max_length)

    doc = nlp(example_text)
    matches = get_matches(matcher, pattern_ids, doc)
    merge_matches(doc, matches)
    for token in doc:
        print(token.text, token.ent_type_)
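`get_bilou`, `make_matcher`, `get_matches`, and `merge_matches` are helpers defined elsewhere in this example. A hedged sketch of `get_bilou` alone, assuming the BILOU roles are bound to otherwise-unused lexeme flag slots (the specific flag choices here are an assumption):

from spacy.attrs import FLAG60, FLAG61, FLAG62, FLAG63

# Assumed flag assignment for the four BILOU roles.
U_ENT = FLAG60
B_ENT = FLAG61
I_ENT = FLAG62
L_ENT = FLAG63


def get_bilou(length):
    # One flag per pattern token: U(nit) for single-token patterns,
    # otherwise B(egin), any number of I(nside), then L(ast).
    if length == 1:
        return [U_ENT]
    return [B_ENT] + [I_ENT] * (length - 2) + [L_ENT]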
def preprocess(texts):
    nlp = English()
    docs = nlp.pipe(texts)

    for doc in docs:
        for np in doc.noun_chunks:
            # Only keep adjectives and nouns, e.g. "good ideas"
            while len(np) > 1 and np[0].dep_ not in ('amod', 'compound'):
                np = np[1:]
            if len(np) > 1:
                # Merge the tokens, e.g. good_ideas
                np.merge(np.root.tag_, np.text, np.root.ent_type_)
            # Iterate over named entities
        for ent in doc.ents:
            if len(ent) > 1:
                # Merge them into single tokens
                ent.merge(ent.root.tag_, ent.text, ent.label_)

        sentences = []

        for sent in doc.sents:
            sentences.append([token.text for token in sent])

        yield sentences
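A minimal usage sketch for the generator above (the sample text is only illustrative):

texts = [u'Good ideas are rare. Barack Obama had one in New York City.']
for sentences in preprocess(texts):
    for sentence in sentences:
        print(sentence)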
Example No. 9
    def test_load(self):
        data_dir = English.default_data_dir()
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
        tagger = Tagger.from_dir(path.join(data_dir, 'tagger'), vocab)
corpus = Corpus(filename=download("parliament-corpus"))

questionTypology = QuestionTypology(corpus,
                                    data_dir,
                                    dataset_name='parliament',
                                    motifs_dir=motifs_dir,
                                    num_dims=25,
                                    num_clusters=num_clusters,
                                    verbose=False,
                                    random_seed=164)

#Preprocessing
#create spacy object
spacy_NLP = spacy.load('en')
vocab = English().vocab

question_fit_file = os.path.join(questionTypology.motifs_dir,
                                 'question_fits.json')

superset_file = os.path.join(questionTypology.motifs_dir,
                             'question_supersets_arcset_to_super.json')

question_to_leaf_fits = []

question_threshold = questionTypology.question_threshold

super_mappings = {}
with open(superset_file) as f:
    for line in f.readlines():
        entry = json.loads(line)
Example No. 11
        temp.append(nlp.vocab.strings[token.dep])
        temp.append([token.head.orth_,t[token.head.idx]])
        dep_triple.append(temp)
    return dep_triple

    
if __name__=='__main__':
    #print 'data'
    
    if len(sys.argv) != 3:
        print 'usage: python pyfile dir_path input_name'
        exit(1)
    dir_path = sys.argv[1]
    f_input = dir_path+sys.argv[2]
   
    nlp= English()
    texts = []
    stime = time.time()
    with codecs.open(f_input,'r','utf-8') as file:
        for line in file:
            line = line.strip()
            lineNo,sentence,tags,tags_er = line.split('\t')
            texts.append(lineNo+sentence)
    etime = time.time()
    print 'load texts time:', etime - stime
             
    pool = Pool(30)
    try:
        DT_result = [generateDT(doc) for doc in nlp.pipe(texts, n_threads=30, batch_size=100)]
    except:
        print 'read file exception'
Example No. 12
    def test_load(self):
        vocab = Vocab.from_dir(path.join(English.default_data_dir(), 'vocab'))
Example No. 13
train_data_file = os.path.join(
    project_path, "data/reviews/review_16_{0}_with_term.train".format(genre))
test_data_file = os.path.join(
    project_path, "data/reviews/review_16_{0}_with_term.test".format(genre))

if genre == 'laptop':
    word2vec_model = '/Users/yinfei.yang/workspace/nlp/word2vec/models/vectors-reviews-electronics.bin'
elif genre == 'restaurants':
    word2vec_model = '/Users/yinfei.yang/workspace/nlp/word2vec/models/vectors-reviews-restaurants.bin'

w2v_model = Word2Vec.load_word2vec_format(word2vec_model, binary=True)

data_helpers.load_data_and_term_labels(train_data_file, test_data_file)
x_text_train, y_train_labels, x_text_test, y_test_labels, labels = \
        data_helpers.load_data_and_term_labels(train_data_file, test_data_file)

en = English()
cd = 0
total = 0

for text, labels in zip(x_text_test, y_test_labels):

    doc = en(u'{0}'.format(text))
    noun_chunks = [str(nc) for nc in doc.noun_chunks]

    #words = text.split()

    for label in labels:
        total += 1

        label_term = label[1]
        flag = False
Example No. 14
    def __init__(self, num_threads=4):
        self.nlp = English(tokenizer=True, parser=True, tagger=True,
                           entity=None, matcher=None)
Example No. 15
    def setTrainingVars(self,
                        P,
                        corp,
                        num_topics,
                        NTest,
                        NTrain,
                        lapp="",
                        includeLabels=False):
        self.includeLabels = includeLabels
        self.T = NTest
        self.TRAIN = NTrain
        self.corpus = corp
        self.dfs = self.corpus.dfs()
        self.K = num_topics

        loc = lapp + "exports/" + P + "/lda_states/ldapy" + str(self.K)
        self.lda = models.ldamodel.LdaModel.load(loc)

        for z in range(0, self.K):
            topic = self.lda.state.get_lambda()[z]
            topic = topic / topic.sum()
            bestn = matutils.argsort(topic, 100, reverse=True)
            terms = [(id, topic[id]) for id in bestn]

            #terms = lda.get_topic_terms(z,100)
            for term in terms:
                word = corp.dictionary[term[0]].lower()
                weight = term[1]
                occurences = self.dfs[term[0]]
                #idf = log(corpus.documentCount/(1+occurences))
                if word in self.wordweights:
                    if weight > self.wordweights[word]:
                        self.wordweights[word] = weight  #* idf
                else:
                    self.wordweights[word] = weight  #* idf
        #print('\n\n')

        with open(lapp + "exports/" + P + "/good_ADJ.txt", "r") as f:
            for line in f:
                self.good_adjs.append(line.strip())

        with open(lapp + "exports/" + P + "/bad_ADJ.txt", "r") as f:
            for line in f:
                self.bad_adjs.append(line.strip())

        with open(lapp + "exports/" + P + "/good_NOUN.txt", "r") as f:
            for line in f:
                self.good_verbs.append(line.strip())

        with open(lapp + "exports/" + P + "/bad_NOUN.txt", "r") as f:
            for line in f:
                self.bad_verbs.append(line.strip())

        with open(lapp + "exports/" + P + "/featuresAprioriLexicalPruned.txt",
                  "r") as f:
            for line in f:
                self.product_features.append(line.strip())

        with open(lapp + "inputs/badwords.txt", "r") as f:
            for line in f:
                self.bad_words.append(line.decode('utf-8').strip())
        self.currentGenerator = NTrain * 2

        self.nnn = NTrain * 2

        self.nlp = English()
        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        self.d = enchant.Dict("en_US")
Example No. 16
import codecs
import cPickle
from spacy.en import English
import spacy

tokenizer = English(parser=False)
en_nlp = spacy.load('en')
name = "semeval_metonymic_test"  # Please specify the input file name.
label = 1  # 1 is for METONYMY/NON-LITERAL, 0 is for LITERAL
inp = codecs.open("data/" + name + ".txt", mode="r", encoding="utf-8")
# PLEASE FORMAT THE INPUT FILE AS ONE SENTENCE PER LINE. SEE BELOW:
# ENTITY<SEP>sentence<ENT>ENTITY<ENT>rest of sentence.
# Germany<SEP>Their privileges as permanent Security Council members, especially the right of veto,
# had been increasingly questioned by <ENT>Germany<ENT> and Japan which, as major economic powers.
out = []
seq_length = 5  # A window of 5 is the DEFAULT for the PUBLICATION methodology. Feel free to experiment.


def locate_entity(document, ent, left_w, right_w):
    left_w = '' if len(left_w) == 0 else left_w[-1].text
    right_w = '' if len(right_w) == 0 else right_w[0].text
    for doc in document:
        if doc.text == ent[0]:
            index = doc.i
            if left_w == '' or document[index - 1].text == left_w:
                if right_w == '' or document[index + len(ent)].text == right_w:
                    return index + len(ent) - 1
    raise Exception(
    )  # If this is ever triggered, there are problems parsing the text. Check SpaCy output!

Example No. 17
def test_single_token_string():
    nlp = English()
    tokens = nlp(u'foobar')
    assert tokens[0].string == 'foobar'
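A companion sketch in the same style, checking that whitespace-separated words come back as separate tokens (the test name is ours, not from the original suite):

def test_two_token_string():
    nlp = English()
    tokens = nlp(u'foo bar')
    assert len(tokens) == 2
    assert tokens[0].orth_ == 'foo'
    assert tokens[1].orth_ == 'bar'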
Example No. 18
    def __init__(self):
        self.nlp = English(parser=False, tagger=False, entity=False)
Example No. 19
    def __init__(self, additive=0, multiplicative=1):
        self.additive = additive
        self.multiplicative = multiplicative
        self.nlp = English(parser=False, tagger=False, entity=False)
"""
file: word_container.py
description: a vector storage datastructure for word vectors
author: Luke de Oliveira ([email protected])
copyright: 2017 Vai Technologies, LLC. All Rights Reserved.
"""

import logging
import numpy as np

from spacy.en import English

from .token_container import TokenContainer

logger = logging.getLogger(__name__)

NLP = English()


def case(s, lower):
    if lower:
        return s.lower()
    return s


class WordVectorBoxException(Exception):
    """ Errors for VectorBox. """
    pass


class WordContainer(TokenContainer):
    """docstring for CharacterContainer"""
Example No. 21
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('-model', type=str, required=True)
    parser.add_argument('-weights', type=str, required=True)
    parser.add_argument('-results', type=str, required=True)
    args = parser.parse_args()

    model = model_from_json(open(args.model).read())
    model.load_weights(args.weights)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    questions_val = open('../data/preprocessed/questions_val2014.txt',
                         'r').read().decode('utf8').splitlines()
    questions_lengths_val = open(
        '../data/preprocessed/questions_lengths_val2014.txt',
        'r').read().decode('utf8').splitlines()
    answers_val = open('../data/preprocessed/answers_val2014.txt',
                       'r').read().decode('utf8').splitlines()
    images_val = open('../data/preprocessed/images_val2014.txt',
                      'r').read().decode('utf8').splitlines()
    vgg_model_path = '../features/coco/vgg_feats.mat'

    questions_lengths_val, questions_val, answers_val, images_val = (
        list(t) for t in zip(*sorted(
            zip(questions_lengths_val, questions_val, answers_val,
                images_val))))

    print 'Model compiled, weights loaded'
    labelencoder = joblib.load('../models/labelencoder.pkl')

    features_struct = scipy.io.loadmat(vgg_model_path)
    VGGfeatures = features_struct['feats']
    print 'Loaded vgg features'
    image_ids = open('../features/coco/coco_vgg_IDMap.txt').read().splitlines()
    img_map = {}
    for ids in image_ids:
        id_split = ids.split()
        img_map[id_split[0]] = int(id_split[1])

    nlp = English()
    print 'Loaded word2vec features'

    nb_classes = 1000
    y_predict_text = []
    batchSize = 128
    widgets = [
        'Evaluating ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=widgets)

    for qu_batch, an_batch, im_batch in pbar(
            zip(grouper(questions_val, batchSize, fillvalue=questions_val[0]),
                grouper(answers_val, batchSize, fillvalue=answers_val[0]),
                grouper(images_val, batchSize, fillvalue=images_val[0]))):
        timesteps = len(nlp(
            qu_batch[-1]))  #questions sorted in descending order of length
        X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps)
        if 'language_only' in args.model:
            X_batch = X_q_batch
        else:
            X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures)
            X_batch = [X_q_batch, X_i_batch]
        y_predict = model.predict_classes(X_batch, verbose=0)
        y_predict_text.extend(labelencoder.inverse_transform(y_predict))

    incorrect_val = 0
    correct_val = 0
    f1 = open(args.results, 'w')
    for prediction, truth, question, image in zip(y_predict_text, answers_val,
                                                  questions_val, images_val):
        temp_count = 0
        for _truth in truth.split(';'):
            if prediction == _truth:
                temp_count += 1

        if temp_count > 2:
            correct_val += 1
        else:
            incorrect_val += 1

        f1.write(question.encode('utf-8'))
        f1.write('\n')
        f1.write(image.encode('utf-8'))
        f1.write('\n')
        f1.write(prediction)
        f1.write('\n')
        f1.write(truth.encode('utf-8'))
        f1.write('\n')
        f1.write('\n')

    f1.write('Final Accuracy is ' +
             str(float(correct_val) / (incorrect_val + correct_val)))
    f1.close()
    f1 = open('../results/overall_results.txt', 'a')
    f1.write(args.weights + '\n')
    f1.write(str(float(correct_val) / (incorrect_val + correct_val)) + '\n\n')
    f1.close()
    print 'Final Accuracy on the validation set is', float(correct_val) / (
        incorrect_val + correct_val)
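The batching loop above relies on a `grouper` helper that is not shown; presumably it is the standard itertools grouper recipe, sketched here for Python 2 to match the print statements used above:

from itertools import izip_longest


def grouper(iterable, n, fillvalue=None):
    # Collect data into fixed-length chunks, padding the last chunk with fillvalue.
    args = [iter(iterable)] * n
    return izip_longest(*args, fillvalue=fillvalue)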
Example No. 22
def getpatents(targetyear):
    timestart = time.time()
    getpatents_directory = os.getcwd()  # get current working directory
    errorCount = 0
    nlp = English()
    PATCOUNT_ORIGIN = [
        205, 260, 280, 264, 298, 301, 293, 282, 332, 315, 346, 311, 265, 326,
        375, 309, 348, 339, 446, 490, 488, 628, 723, 827, 884, 968, 1002, 1084,
        1304, 1482, 1648, 1843, 2251, 2928, 3639, 3958, 3623, 2927, 2047, 904,
        99
    ]
    store_chunks_FullParent = []

    getpatents_directory_output = getpatents_directory + "/Patents/"
    if not os.path.exists(getpatents_directory_output):
        os.mkdir(getpatents_directory_output, 0o755)

    csvfile_PatNUM = open(
        '(03) SolarPV_41585 Patent List ORIGINAL with dssc patents 9501 v0.2 only num.csv',
        'r')
    csvfile_ouput_by_year = open(
        getpatents_directory_output + str(targetyear) + '.csv', 'w+')

    reader_PatNO = csv.reader(csvfile_PatNUM, delimiter=' ', quotechar='|')
    writer_yearoutput = csv.writer(csvfile_ouput_by_year,
                                   delimiter=',',
                                   quotechar=',',
                                   quoting=csv.QUOTE_MINIMAL)

    PATNUM = []
    splited_words = 0
    sumstart = 0
    normalcount = 0

    for PatCountofYear in PATCOUNT_ORIGIN:  # this loop moves PAT_HEADER to the right starting position.
        if splited_words >= targetyear - 1976:  # e.g., if targetyear is 1977, PAT_HEADER needs to start from 206, so sumstart = 205.
            break
        else:
            sumstart += PatCountofYear
            splited_words += 1
    PAT_HEADER = sumstart  # if PAT_HEADER=n,PAT_HEADER pointed nth row exactly in reader_PatNO.

    for PATNO in reader_PatNO:
        PATNUM.append(PATNO[0])
    # print(PATNUM[206])
    while normalcount < PATCOUNT_ORIGIN[targetyear % 1976]:
        PAT_HEADER += 1  # HEADER is somehow not getting incremented (+1) somewhere.
        # print(PAT_HEADER)
        if PAT_HEADER == (sumstart + PATCOUNT_ORIGIN[targetyear % 1976] + 1):
            # HEADER value exceeds the valid range of this year's patent count.
            break

        url = ''.join([
            'https://patents.google.com/patent/', PATNUM[PAT_HEADER - 1], '/en'
        ])  # row 1 in reader_PatNO is stored PATNUM[0].

        print("\nURL NUMBER : " + str(PAT_HEADER) + " = " + url + "\n")

        urlText, backCitation, pubDate = readWEBSITE.getText(url)

        if urlText is None:  # an error occurred while parsing the patent.
            errorCount += 1
            continue
        normalcount += 1

        doc = nlp(urlText.decode('utf-8'))
        chunks_store = []
        store_chunks_singlePatent = []
        store_chunks_singlePatent.append(PAT_HEADER)

        for word in doc.noun_chunks:
            chunks_store.append(word)

        for span in chunks_store:

            store_str = span.text  # get text part of span in chunks_store.
            splited_words = store_str.split()
            splited_word = []
            # store_chunks_singlePatent.append()
            # Temporary stop-word list (same case-sensitive words as the original chain of ifs).
            stop_words = {
                'a', 'A', 'an', 'An', 'the', 'The', 'THE', 'this', 'This',
                'their', 'Their', 'Such', 'such', 'it', 'It', 'they', 'They',
                'these', 'These'
            }
            for splited_single_word in splited_words:
                if splited_single_word in stop_words:
                    continue  # word is a stop word; check the next one.
                splited_word.append(splited_single_word)

            combinedWord = " ".join(splited_word)  # join the words into one string.
            if combinedWord == "":  # if the string is empty, do not append it to the list.
                continue
            store_chunks_singlePatent.append(
                combinedWord
            )  # store_chunks_singlePatent stores all of the words used in this particular patent.

        store_chunks_FullParent.append(
            store_chunks_singlePatent
        )  # store_chunks_FullParent stores all of the words used in every patent of targetyear.

    for row_input in store_chunks_FullParent:
        row_input = str(row_input)
        writer_yearoutput.writerow(row_input.split(","))

    print("Errors occurred {0} times. Succeeded {1} times.\n".format(
        errorCount, normalcount))
    csvfile_PatNUM.close()
    csvfile_ouput_by_year.close()
    timeend = time.time()
    print("it took {0} sec to get the patent text of {1}".format(
        (timeend - timestart), targetyear))

    return None
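A hypothetical driver for the function above (the target year is only an example):

if __name__ == '__main__':
    # Scrape and export the noun chunks of every patent from a single target year.
    getpatents(1998)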
Example No. 23
import re
import os
import numpy as np
import json
import pickle
import datetime
import spacy
from keras.utils import to_categorical
from nltk.tokenize import word_tokenize, sent_tokenize
from load_squad_wiki_data import get_squad_data, get_squad_wiki_data
from gensim.models import Word2Vec
from spacy.en import English
nlp = spacy.load('en', parser = False, matcher = False, add_vectors = False)
nlp_en = English()

class MakeIter(object):
    def __init__(self, generator_func, **kwargs):
        self.generator_func = generator_func
        self.kwargs = kwargs
    def __iter__(self):
        return self.generator_func(**self.kwargs)
    
class Embeddings:
    def __init__(self, size, window, min_count, workers):
        self.size = size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        base_file_name = '_'.join([str(number) for number in [size, window, min_count, workers]])
        self.path_word2vec_model = '../data/word2vec_model_{0}.pickle'.format(base_file_name)
        self.path_word_tokenized_sentence = '../data/word_tokenized_sentence_{0}.json'.format(base_file_name)
def main(filename, systemname, print_us, print_ont, statistics, link, prolog,
         per_role, threshold, base, weights):
    """Main function to run the entire program."""

    # Initialize spaCy just once (this takes most of the time...)
    print("Initializing Natural Language Processor . . .")
    start_nlp_time = timeit.default_timer()
    nlp = English()
    nlp_time = timeit.default_timer() - start_nlp_time

    start_parse_time = timeit.default_timer()
    miner = StoryMiner()

    # Read the input file
    set = Reader.parse(filename)
    us_id = 1

    # Keep track of all errors
    success = 0
    fail = 0
    list_of_fails = []
    errors = ""
    c = Counter()

    # Keeps track of all successfully created User Story objects
    us_instances = []
    failed_stories = []

    # Parse every user story (remove punctuation and mine)
    for s in set:
        try:
            user_story = parse(s, us_id, systemname, nlp, miner)
            user_story = c.count(user_story)
            success = success + 1
            us_instances.append(user_story)
        except ValueError as err:
            failed_stories.append([us_id, s, err.args])
            errors += "\n[User Story " + str(us_id) + " ERROR] " + str(
                err.args[0]) + "! (\"" + " ".join(str.split(s)) + "\")"
            fail = fail + 1
        us_id = us_id + 1

    # Print errors (if found)
    if errors:
        Printer.print_head("PARSING ERRORS")
        print(errors)

    parse_time = timeit.default_timer() - start_parse_time

    # Generate the term-by-user story matrix (m), and additional data in two other matrices
    start_matr_time = timeit.default_timer()

    matrix = Matrix(base, weights)
    matrices = matrix.generate(us_instances, ''.join(set), nlp)
    m = matrices[0]
    count_matrix = matrices[1]
    stories_list = matrices[2]
    rme = matrices[3]

    matr_time = timeit.default_timer() - start_matr_time

    # Print details per user story, if argument '-u'/'--print_us' is chosen
    if print_us:
        print("Details:\n")
        for us in us_instances:
            Printer.print_us_data(us)

    # Generate the ontology
    start_gen_time = timeit.default_timer()

    patterns = Constructor(nlp, us_instances, m)
    out = patterns.make(systemname, threshold, link)
    output_ontology = out[0]
    output_prolog = out[1]
    output_ontobj = out[2]
    output_prologobj = out[3]
    onto_per_role = out[4]

    # Print out the ontology in the terminal, if argument '-o'/'--print_ont' is chosen
    if print_ont:
        Printer.print_head("MANCHESTER OWL")
        print(output_ontology)

    gen_time = timeit.default_timer() - start_gen_time

    # Gather statistics and print the results
    stats_time = 0
    if statistics:
        start_stats_time = timeit.default_timer()

        statsarr = Statistics.to_stats_array(us_instances)

        Printer.print_head("USER STORY STATISTICS")
        Printer.print_stats(statsarr[0], True)
        Printer.print_stats(statsarr[1], True)
        Printer.print_subhead(
            "Term - by - User Story Matrix ( Terms w/ total weight 0 hidden )")
        hide_zero = m[(m['sum'] > 0)]
        print(hide_zero)

        stats_time = timeit.default_timer() - start_stats_time

    # Write output files
    w = Writer()

    folder = "output/" + str(systemname)
    reports_folder = folder + "/reports"
    stats_folder = reports_folder + "/stats"

    outputfile = w.make_file(folder + "/ontology", str(systemname), "omn",
                             output_ontology)
    files = [["Manchester Ontology", outputfile]]

    outputcsv = ""
    sent_outputcsv = ""
    matrixcsv = ""

    if statistics:
        outputcsv = w.make_file(stats_folder, str(systemname), "csv",
                                statsarr[0])
        matrixcsv = w.make_file(stats_folder,
                                str(systemname) + "-term_by_US_matrix", "csv",
                                m)
        sent_outputcsv = w.make_file(stats_folder,
                                     str(systemname) + "-sentences", "csv",
                                     statsarr[1])
        files.append(["General statistics", outputcsv])
        files.append(["Term-by-User Story matrix", matrixcsv])
        files.append(["Sentence statistics", sent_outputcsv])
    if prolog:
        outputpl = w.make_file(folder + "/prolog", str(systemname), "pl",
                               output_prolog)
        files.append(["Prolog", outputpl])
    if per_role:
        for o in onto_per_role:
            name = str(systemname) + "-" + str(o[0])
            pont = w.make_file(folder + "/ontology", name, "omn", o[1])
            files.append(["Individual Ontology for '" + str(o[0]) + "'", pont])

    # Print the used ontology generation settings
    Printer.print_gen_settings(matrix, base, threshold)

    # Print details of the generation
    Printer.print_details(fail, success, nlp_time, parse_time, matr_time,
                          gen_time, stats_time)

    report_dict = {
        "stories":
        us_instances,
        "failed_stories":
        failed_stories,
        "systemname":
        systemname,
        "us_success":
        success,
        "us_fail":
        fail,
        "times": [[
            "Initializing Natural Language Processor (<em>spaCy</em> v" +
            pkg_resources.get_distribution("spacy").version + ")", nlp_time
        ], ["Mining User Stories", parse_time],
                  ["Creating Factor Matrix", matr_time],
                  ["Generating Manchester Ontology", gen_time],
                  ["Gathering statistics", stats_time]],
        "dir":
        os.path.dirname(os.path.realpath(__file__)),
        "inputfile":
        filename,
        "inputfile_lines":
        len(set),
        "outputfiles":
        files,
        "threshold":
        threshold,
        "base":
        base,
        "matrix":
        matrix,
        "weights":
        m['sum'].copy().reset_index().sort_values(
            ['sum'], ascending=False).values.tolist(),
        "counts":
        count_matrix.reset_index().values.tolist(),
        "classes":
        output_ontobj.classes,
        "relationships":
        output_prologobj.relationships,
        "types":
        list(count_matrix.columns.values),
        "ontology":
        Utility.multiline(output_ontology)
    }

    # Finally, generate a report
    report = w.make_file(reports_folder,
                         str(systemname) + "_REPORT", "html",
                         generate_report(report_dict))
    files.append(["Report", report])

    # Print the location and name of all output files
    for file in files:
        if str(file[1]) != "":
            print(
                str(file[0]) + " file successfully created at: \"" +
                str(file[1]) + "\"")
Example No. 25
class Sentence:

    # SCORENLP = StanfordCoreNLP("/Users/rajpav/anaconda2/lib/python2.7/site-packages/stanford-corenlp-full-2016-10-31")
    SCORENLP = StanfordCoreNLP(
        "/Users/acharya.n/anaconda2/lib/python2.7/stanford-corenlp-full-2016-10-31"
    )

    TEXT_LEMMA_PATTERN = re.compile('(\[{1})([a-zA-Z0-9.= $_<>\"\/?]+)(\]{1})')
    PARTS_OF_SPEECH_PATTERN = re.compile(
        '(\({1})([a-zA-Z0-9.= $_<>\-\"\/?]+)(\){1})')
    NON_ALLOWED_NOUN_CHUNKS = ["how", "many", "much"]
    STRING_TO_DICT_PATTERN = re.compile(r'(\S+)=(".*?"|\S+)')
    SINGULAR_PRONOUN = ['he', 'she', 'it', 'him', 'her', 'his']
    SINGULAR_SUBJECT_PRONOUN = ['he', 'she', 'him', 'her', 'his']
    SINGULAR_OBJECT_PRONOUN = ['it']
    PLURAL_PRONOUN = ['they', 'them']
    LEMMATIZER_MODULE = LemmatizerModule()
    SPACY_PARSER = English()

    def __init__(self, sentence_json, question, sentence_index):
        self.m_predicted_label = sentence_json["PredictedLabel"]
        self.m_sentence_text = sentence_json["Sentence"]
        self.m_syntactic_pattern = sentence_json["SyntacticPattern"]
        ###print self.m_sentence_text
        self.m_has_a_cardinal = False
        self.m_cardinal = None
        self.m_has_a_dobj = False
        self.m_dobj = None
        self.m_has_a_nsubj = False
        self.m_nsubj = None
        self.m_pobj = None
        self.m_has_a_pobj = False
        self.m_quantified_entity = None
        self.m_owner_entity = None
        self.m_object_entity = None
        self.m_evaluating_subject = None
        self.m_evaluating_object = None
        self.m_has_a_pronoun = False
        self.m_processed_pronoun = None
        self.m_transfer_entity = None
        self.m_transfer_quantified_entity = None
        self.m_all_pronouns = []
        self.m_all_nouns = []
        self.m_all_noun_lemmas = []
        self.m_question = question
        self.m_words_pos = OrderedDict()
        self.m_is_first_word_an_expletive = True if self.m_syntactic_pattern[
            0] == 'E' else False
        self.m_expletive_index = -1
        if 'E' in self.m_syntactic_pattern:
            self.m_expletive_index = self.m_syntactic_pattern.index('E')

        self.m_sentence_index = sentence_index
        self.m_is_pronoun_noun_found = False
        self.m_current_pronoun = None
        self.temp_transfer_entity = None
        self.temp_dobj = None
        self.m_has_an_unknown_quantity = False
        self.m_possible_evaluating_subjects = []
        self.m_possible_evaluating_object = None
        self.m_question_label = None
        self.m_complex_nouns = []
        self.m_sentece_words = []
        self.m_words_index = {}
        self.m_compound_modifiers = []
        question_label_string = "QuestionLabel"
        if self.m_predicted_label == '?' and question_label_string in sentence_json:
            self.m_question.m_evaluating_sentence = self
            self.m_question_label = sentence_json["QuestionLabel"]

    def __str__(self):
        return self.m_sentence_text

    def parse_sentence(self):
        self.extract_dependencies()
        #         self.process_pronouns()
        if self.m_predicted_label == '?':
            self.extract_evaluation_entities()
        else:
            self.extract_entities()

    def extract_dependencies(self):
        print 'in extract dep'
        print self.m_sentence_text
        corenlp_result = json.loads(
            Sentence.SCORENLP.parse(self.m_sentence_text))
        current_sentence = corenlp_result["sentences"][0]
        parse_tree = current_sentence["parsetree"]
        #         print 'parse_tree',parse_tree
        self.m_dependencies = current_sentence["dependencies"]
        #         self.m_matched_tuples = Sentence.TEXT_LEMMA_PATTERN.findall(parse_tree)
        #         print 'matched tuples',self.m_matched_tuples
        #         print self.m_dependencies
        self.m_matched_pos = Sentence.PARTS_OF_SPEECH_PATTERN.findall(
            parse_tree)
        #         print self.m_matched_pos
        index_counter = 0
        for matched_pos in self.m_matched_pos:
            index_counter = index_counter + 1
            word_pos = matched_pos[1].split(" ")
            parts_of_speech = word_pos[0]
            word = word_pos[1].lower()
            #             #print word
            self.m_words_index[word] = index_counter
            self.m_sentece_words.append(word)
            self.m_words_pos[word] = parts_of_speech

            if parts_of_speech in PublicKeys.NOUN_POS:
                lemma = Sentence.LEMMATIZER_MODULE.lemmatize(word)
                self.m_all_noun_lemmas.append(lemma)
                self.m_all_nouns.append(word)
                if parts_of_speech == 'NNP':
                    self.m_question.add_proper_noun(word)
            if parts_of_speech == 'CD':
                self.m_has_a_cardinal = True
                if self.m_expletive_index != -1:
                    self.m_is_first_word_an_expletive = True
                try:
                    #                     #print 1
                    #                     #print word
                    #                     #print 2
                    #                     #print float(word)
                    #                     #print 3
                    #                     #print str(float(word))
                    self.m_cardinal = Decimal(word)
#                     #print self.m_cardinal
                except:
                    self.m_cardinal = PublicKeys.text2int(word)
                    #                     #print self.m_cardinal
                    self.m_words_index[str(self.m_cardinal)] = index_counter
#                     #print 'insert'
#                     #print self.m_words_index[str(self.m_cardinal)]
                if self.m_predicted_label == '-':
                    self.m_cardinal = -self.m_cardinal
            elif parts_of_speech == 'PRP' or parts_of_speech == 'PRP$':
                ###print 'found pronoun'
                self.m_has_a_pronoun = True
                self.m_all_pronouns.append(word)
                ###print self.m_is_pronoun_noun_found
                if self.m_is_pronoun_noun_found == False:
                    ###print 'In sentence'
                    ###print self.m_sentence_index
                    ###print self.m_question.m_coref_dict
                    ###print self.m_question.m_coref_dict[self.m_sentence_index]
                    current_sentence_coref_dict = self.m_question.m_coref_dict[
                        self.m_sentence_index]

                    ###print 'pronoun not found yet' + word
                    ###print current_sentence_coref_dict
                    if word in current_sentence_coref_dict:
                        ###print 'word in dict true'
                        current_pronoun_noun = current_sentence_coref_dict[
                            word]
                        ###print 'current_pronoun_noun' + current_pronoun_noun
                        ###print self.m_question.m_proper_nouns
                        if current_pronoun_noun.lower(
                        ) in self.m_question.m_proper_nouns:
                            self.m_processed_pronoun = current_pronoun_noun
                            self.m_is_pronoun_noun_found = True
                            self.m_current_pronoun = word
                            ###print "Pronoun Noun :" + self.m_processed_pronoun

        if (self.m_predicted_label == '-' or self.m_predicted_label == '+'
                or self.m_predicted_label
                == '=') and self.m_has_a_cardinal == False:
            self.m_has_a_cardinal = True
            self.m_cardinal = 'X'
            self.m_has_an_unknown_quantity = True

    def extract_entities(self):
        print 'in extract entities'
        sentence_parse = Sentence.SPACY_PARSER(self.m_sentence_text)
        spacy_subj = None
        temp_pobj = None
        for token in sentence_parse:
            token_dep = token.dep_
            print(token.orth_, token.dep_, token.head.orth_,
                  [t.orth_
                   for t in token.lefts], [t.orth_ for t in token.rights])
            if token_dep == 'pobj':
                temp_pobj = token
            elif token_dep == 'nsubj' or token_dep == 'nsubjpass':
                spacy_subj = token.orth_.lower()
            elif token_dep == 'poss':
                self.assign_poss_entities(token)
            elif token_dep == 'compound' or token_dep == 'amod':
                print 'in compound and amod case'
                modifier = Sentence.LEMMATIZER_MODULE.lemmatize(token.orth_)
                compound_dobj = Sentence.LEMMATIZER_MODULE.lemmatize(
                    token.head.orth_)
                compound_modifier = CompoundModifier(modifier, compound_dobj)
                print 'found compound modifier:', modifier, compound_dobj
                self.m_compound_modifiers.append(compound_modifier)
                self.m_complex_nouns.append(modifier + " " + compound_dobj)
#                 self.temp_dobj = compound_dobj

        sentence_svos = findSVOs(sentence_parse)
        print "svos", sentence_svos, len(sentence_svos)
        if len(sentence_svos) > 0:
            transfer_entity_relation = None
            #             #print 'starts with an expl:',self.m_is_first_word_an_expletive
            if self.m_is_first_word_an_expletive == False:

                print 'svo'
                print sentence_svos[0][0]
                print sentence_svos[0][2]

                #                 trying to assign subj and obj from svo
                self.assign_nsubj(sentence_svos[0][0])
                self.assign_dobj(sentence_svos[0][2])

                print 'after trying to assign subj', self.m_nsubj
                print 'after trying to assign dobj:'
                print 'dobj exists?:', self.m_has_a_dobj
                print 'dobj:', self.m_dobj
                print 'temp dobj:', self.temp_dobj
                #print temp_pobj

                if self.m_has_a_dobj == False:
                    if self.temp_dobj != None:
                        print 'before temp dobj'
                        self.assign_dobj(self.temp_dobj)
                        if self.temp_transfer_entity != None:
                            self.assign_transfer_entity(
                                self.temp_transfer_entity, 'dobj')
                    elif temp_pobj != None:
                        print 'before temp pobj'
                        self.assign_dobj(temp_pobj.orth_.lower())
                        #self.assign_dobj(self.m_pobj, 'pobj')
                        self.assign_transfer_entity(sentence_svos[0][2],
                                                    'dobj')
                elif temp_pobj != None:
                    print 'in temp dobj != None'
                    self.assign_transfer_entity(temp_pobj.orth_.lower(),
                                                'pobj')
                elif self.temp_transfer_entity != None:
                    print 'in temp transfer entity !- None'
                    self.assign_transfer_entity(self.temp_transfer_entity,
                                                'poss')
            else:
                #                 #print 'before 2nsd svo'
                self.assign_dobj(sentence_svos[0][2])

                if temp_pobj != None:
                    self.assign_nsubj(temp_pobj.orth_.lower())
            ###print 'before calling extract quantified'
            self.extract_quantified_entities(True, transfer_entity_relation)
        elif spacy_subj != None and temp_pobj != None:
            self.temp_dobj = temp_pobj.orth_
            print 'In spacy'
            #print self.temp_dobj
            self.assign_nsubj(spacy_subj)
            self.assign_dobj(self.temp_dobj)
            self.extract_quantified_entities(False, None)
        elif spacy_subj != None and self.m_question.m_question_label != 'c':

            #             print 'spacy_subj is not none'
            self.assign_dobj(spacy_subj)
            self.extract_quantified_entities(False, None)

        elif self.m_question.m_question_label == 'c':
            if self.m_has_a_cardinal:
                print 'found nothing should do something.'
                quantified_non_entity = QuantifiedNonEntity(self.m_cardinal)
                if spacy_subj != None:
                    self.assign_nsubj(spacy_subj)
                    quantified_non_entity.set_owner_entity(self.m_owner_entity)
                    self.m_question.add_quantified_non_entity(
                        quantified_non_entity)

    def assign_nsubj(self, subj):
        self.m_has_a_nsubj = True
        self.m_nsubj = subj
        if self.m_nsubj in self.m_all_pronouns:
            self.m_nsubj = self.m_processed_pronoun
        self.m_owner_entity = Entity('nsubj', self.m_nsubj)

    def assign_dobj(self, dobj):
        #         #print dobj
        #print 'in assigning dobj'
        is_dobj_integer = self.is_integer(dobj)
        if is_dobj_integer == False and dobj not in self.m_question.m_proper_nouns:
            self.m_has_a_dobj = True
            self.m_dobj = dobj
        elif dobj in self.m_question.m_proper_nouns:
            self.temp_transfer_entity = dobj
            if self.temp_transfer_entity in self.m_all_pronouns:
                self.temp_transfer_entity = self.m_processed_pronoun
        elif is_dobj_integer == True:
            for k, v in self.m_question.m_quantified_entities.items():
                if self.m_has_a_dobj:
                    break
                for e in v:
                    print 'assigning cardinal dobj'
                    self.m_has_a_dobj = True
                    self.m_dobj = unicode(e.get_name())
                    self.m_words_pos[e.get_name()] = 'NN'
                    self.m_words_index[e.get_name()] = self.m_words_index[dobj]
                    break

    def assign_pobj(self, token):
        token_orth = token.orth_.lower()
        ###print token_orth
        ###print self.m_question.get_quantified_entities()
        ###print self.m_question.get_quantified_entity_objects()
        if token_orth in self.m_question.get_quantified_entities():
            ###print 'assigning pobj'
            self.m_pobj = token_orth
            if self.m_pobj in self.m_all_pronouns:
                self.m_pobj = self.m_processed_pronoun

            self.m_has_a_pobj = True
        #elif token_orth in self.m_question.get_quantified_entity_objects():
        else:
            self.temp_dobj = token_orth

    def assign_poss_entities(self, token):
        self.temp_transfer_entity = token.orth_.lower()
        self.temp_dobj = token.head.orth_.lower()

    def assign_transfer_entity(self, val, pos):
        if val != None:
            val = val.lower()
            ##print 'in assign transfer entity:' + val
            if val in self.m_all_pronouns:
                val = self.m_processed_pronoun

            self.m_transfer_entity = Entity(pos, val)

    def extract_normal_entities(self):
        transfer_entity_relation = None
        for dependency in self.m_dependencies:
            relation = dependency[0]
            if relation == 'nsubj':
                self.m_has_a_nsubj = True
                self.m_nsubj = dependency[2]
                self.m_owner_entity = Entity('nsubj', self.m_nsubj)
            elif relation == 'dobj':
                self.m_has_a_dobj = True
                self.m_dobj = dependency[2]
            elif relation == 'nmod:to' or relation == 'nmod:from' or relation == 'nmod:poss' or relation == 'iobj':
                transfer_entity_relation = relation
                if self.m_has_a_pronoun:
                    self.m_transfer_entity = Entity(
                        relation, unicode(self.m_processed_pronoun, "utf-8"))
                else:
                    self.m_transfer_entity = Entity(relation, dependency[2])
        self.extract_quantified_entities(True, transfer_entity_relation)

    def extract_quantified_entities(self, to_create_transfer_entity,
                                    transfer_entity_relation):
        ##print self.m_transfer_entity
        #         ##print self.m_owner_entity
        print 'in extract quantified entities'
        if self.m_cardinal != None and self.m_has_an_unknown_quantity == False:
            self.validate_dobj_index()
            print 'in cardinal case and no unknown quantity'
            #             #print self.m_dobj

            lemmatized_dobj = Sentence.LEMMATIZER_MODULE.lemmatize(self.m_dobj)
            compound_modifier = self.get_compound_modifier_for_dobj(
                self.m_dobj)

            if self.m_owner_entity != None:
                print 'owner entity not none'
                ##print self.m_dobj
                ##print type(self.m_dobj)
                ##print self.m_dobj.lower()

                owner_modified_cardinal = self.m_cardinal

                if self.m_has_an_unknown_quantity:
                    if self.m_predicted_label == '-':
                        owner_modified_cardinal = "-" + self.m_cardinal
                        transfer_transaction_cardinal = self.m_cardinal
                    else:
                        owner_modified_cardinal = self.m_cardinal
                        transfer_transaction_cardinal = "-" + self.m_cardinal
                else:
                    transfer_transaction_cardinal = -self.m_cardinal
                print 'after calc transfer cardinal:', transfer_transaction_cardinal
                temp_quantified_entity = QuantifiedEntity(
                    owner_modified_cardinal, 'dobj', lemmatized_dobj, False)
                temp_quantified_entity.set_owner_entity(self.m_owner_entity)

                transfer_transaction = TransferTransaction(
                    to_create_transfer_entity, self.m_transfer_entity,
                    lemmatized_dobj, transfer_transaction_cardinal)
                temp_quantified_entity.add_transfer_transaction(
                    transfer_transaction)

                print 'after merging', compound_modifier
                if compound_modifier != None:
                    print 'modifier quantity,'
                    compound_modifier.set_quantity(owner_modified_cardinal)
                    temp_quantified_entity.add_compound_modifier(
                        compound_modifier)
                    for k, v in self.m_question.m_quantified_entities.items():
                        for e in v:
                            print 'comparisons:', e.get_name(
                            ), compound_modifier.m_dobj
                            if e.get_name() == compound_modifier.m_dobj:
                                print 'adding compound modifier'
                                e.add_compound_modifier(compound_modifier)
                merge_entities = self.get_or_merge_entity(
                    temp_quantified_entity, transfer_transaction)

#                 self.m_quantified_entity = temp_quantified_entity if merge_entities == True else None
            else:
                self.m_owner_entity = Entity("global", u"global")
                global_modified_cardinal = self.m_cardinal
                if self.m_has_an_unknown_quantity:
                    if self.m_predicted_label == '-':
                        global_modified_cardinal = "-" + self.m_cardinal
                    else:
                        global_modified_cardinal = self.m_cardinal
                elif global_modified_cardinal < 0:
                    global_modified_cardinal = -global_modified_cardinal

                temp_quantified_entity = QuantifiedEntity(
                    global_modified_cardinal, 'dobj', lemmatized_dobj, False)
                temp_quantified_entity.set_owner_entity(self.m_owner_entity)
                merge_entities = self.get_or_merge_entity(
                    temp_quantified_entity, None)

#                 self.m_quantified_entity = temp_quantified_entity if merge_entities == True else None

            if to_create_transfer_entity and self.m_transfer_entity != None:
                ##print 'creating transfer entity'

                transfer_modified_cardinal = self.m_cardinal
                if self.m_has_an_unknown_quantity:
                    if self.m_predicted_label == '+':
                        transfer_modified_cardinal = "-" + self.m_cardinal
                        transfer_transaction_cardinal = self.m_cardinal
                    else:
                        transfer_modified_cardinal = self.m_cardinal
                        transfer_transaction_cardinal = "-" + self.m_cardinal
                else:
                    transfer_modified_cardinal = -self.m_cardinal
                    transfer_transaction_cardinal = self.m_cardinal

                ##print transfer_modified_cardinal
                temp_transfer_quantified_entity = QuantifiedEntity(
                    transfer_modified_cardinal, transfer_entity_relation,
                    lemmatized_dobj, True)
                temp_transfer_quantified_entity.set_owner_entity(
                    self.m_transfer_entity)
                transfer_transaction = TransferTransaction(
                    to_create_transfer_entity, self.m_owner_entity,
                    lemmatized_dobj, transfer_transaction_cardinal)
                temp_transfer_quantified_entity.add_transfer_transaction(
                    transfer_transaction)

                to_merge_transfer_entity = self.get_or_merge_entity(
                    temp_transfer_quantified_entity, transfer_transaction)
                self.m_transfer_quantified_entity = temp_transfer_quantified_entity if to_merge_transfer_entity == True else None

        else:
            self.m_object_entity = Entity('dobj', self.m_dobj)

    def get_compound_modifier_for_dobj(self, dobj):
        dobj = Sentence.LEMMATIZER_MODULE.lemmatize(dobj)
        print 'In compound modifier for dobj', dobj, len(
            self.m_compound_modifiers)
        compound_modifier = None
        for modifier in self.m_compound_modifiers:
            print 'modifier dobj', modifier.m_dobj
            if dobj == modifier.m_dobj:
                compound_modifier = modifier
                break
        return compound_modifier

    def validate_dobj_index(self):
        num = self.m_cardinal
        #         #print self.m_words_index
        #         #print num
        #         #print self.m_dobj
        if num < 0:
            num = -num
        if self.m_dobj == None:
            dobj_index = 0
        else:
            if self.m_dobj.lower() in self.m_words_index:
                dobj_index = self.m_words_index[self.m_dobj.lower()]
            else:
                dobj_index = 0

            dobj_lower = self.m_dobj.lower()
            #             print 'pos before prp',self.m_words_pos
            #             print 'dobj before prp',self.m_dobj
            if self.m_words_pos[self.m_dobj.lower(
            )] == 'PRP' or self.m_words_pos[dobj_lower] == 'PRP$':
                for k, v in self.m_question.m_quantified_entities.items():
                    if self.m_has_a_dobj:
                        break
                    for e in v:
                        #print 'assigning pronoun object'
                        self.assign_dobj(unicode(e.get_name()))
                        break

        cardinal_index = self.m_words_index[str(num)] if str(
            num) in self.m_words_index else self.m_words_index[str(int(num))]
        if dobj_index < cardinal_index:
            current_possible_obj = None

            to_consider_for_objects = []
            for current_word in self.m_words_index:
                current_word_index = self.m_words_index[current_word]
                if current_word_index > cardinal_index and (
                        current_word in self.m_all_nouns
                        or current_word in self.m_all_pronouns):
                    current_possible_obj = Sentence.LEMMATIZER_MODULE.lemmatize(
                        current_word)
                    break
            if current_possible_obj is not None:
                self.assign_dobj(unicode(current_possible_obj))

    def get_or_merge_entity(self, temp_entity, transfer_transaction):
        to_merge_entities = self.m_question.add_quantified_entity(temp_entity)
        print 'to merge?'
        ##print to_merge_entities
        if to_merge_entities:
            self.merge_entities(temp_entity, transfer_transaction)
        elif self.m_predicted_label == '=':
            temp_entity.flip_equal_to_state()

        return to_merge_entities

    def merge_entities(self, temp_quantified_entity, transfer_transaction):
        ##print "in merge"
        quantified_entities = self.m_question.get_quantified_entities()
        subject = temp_quantified_entity.get_owner_entity().get_name()
        #sentence_output = self.output(True)
        sentence_output = temp_quantified_entity.get_cardinal()
        subject_quantified_entities = quantified_entities[subject]
        for subject_quantified_entity in subject_quantified_entities:
            if (subject_quantified_entity.get_name()
                    == temp_quantified_entity.get_name()):
                if self.m_predicted_label == '=':
                    subject_quantified_entity.set_equal_to_state(
                        sentence_output)
                else:
                    subject_quantified_entity.perform_operation(
                        sentence_output, self.m_has_an_unknown_quantity,
                        transfer_transaction)
                ##print subject_quantified_entity

    def extract_evaluation_entities(self):
        sentence_parse = Sentence.SPACY_PARSER(self.m_sentence_text)
        # print "in extract evaluating entities"
        # print sentence_parse
        for token in sentence_parse:
            if token.dep_ == 'compound' or token.dep_ == 'amod':
                print 'in compound and amod case'
                modifier = Sentence.LEMMATIZER_MODULE.lemmatize(token.orth_)
                compound_dobj = Sentence.LEMMATIZER_MODULE.lemmatize(
                    token.head.orth_)
                compound_modifier = CompoundModifier(modifier, compound_dobj)
                print 'found compound modifier:', modifier, compound_dobj
                self.m_compound_modifiers.append(compound_modifier)
                self.m_complex_nouns.append(
                    Sentence.LEMMATIZER_MODULE.lemmatize(token.orth_) + " " +
                    Sentence.LEMMATIZER_MODULE.lemmatize(token.head.orth_))

        ##print 'In extract evaluating entities'
#         noun_chunks = self.get_noun_chunks(self.m_sentence_text)
#         if self.m_is_pronoun_noun_found == True:
##print self.m_processed_pronoun

#         for index, val in enumerate(noun_chunks):
#             val_lower_unicode = val.lower()
#             for word in Sentence.NON_ALLOWED_NOUN_CHUNKS:
#                 if word in val_lower_unicode:
#                     val_lower_unicode = val_lower_unicode.replace(word,'')
# #                     noun_chunks[index] = val_lower_str.replace(word,'')
#                     ##print noun_chunks
#             ##print 'Before assigning chunk'
#             ##print val_lower_unicode
#             noun_chunks[index] = val_lower_unicode.strip()
##print "after removing non allowed chunks: ", noun_chunks

#             chunk_split = val.split()
#             if len(chunk_split) > 1:
#                 sentence_split = self.m_sentence_text.split()
#                 lemma_sentence = ''
#                 for sentence_split_word in sentence_split:
#                     lemma_sentence = lemma_sentence + ' ' + Sentence.LEMMATIZER_MODULE.lemmatize(sentence_split_word)
#                 ##print lemma_sentence
#                 noun_chunks = self.get_noun_chunks(lemma_sentence)

#         for index, val in enumerate(noun_chunks):
#             if val == self.m_current_pronoun:
#                 noun_chunks[index] = self.m_processed_pronoun
#             noun_chunks[index] = Sentence.LEMMATIZER_MODULE.lemmatize(unicode(noun_chunks[index])).lower()

##print 'After lemmatizing and pronoun replacement'
##print noun_chunks

#         for noun in noun_chunks:
#             if noun in self.m_question.get_quantified_entities():
#                 self.m_possible_evaluating_subjects.append(noun)
#             elif self.m_possible_evaluating_object == None:
#                 self.m_possible_evaluating_object = noun
##print 'possible subjects'
##print self.m_possible_evaluating_subjects
##print 'possible object'
##print self.m_possible_evaluating_object

#         for dependency in self.m_dependencies:
#             if dependency[0] == 'nsubj':
#                 self.m_has_a_nsubj  = True
#                 self.m_nsubj = dependency[2]
#                 self.m_evaluating_subject = Entity('nsubj', self.m_nsubj)
#             elif dependency[0] == 'dobj':
#                 # extract parts of speech of the relation dep and gov
#                 # if none of them is noun. apply some logic to find the evaluating object
#                 ##print self.m_words_pos
#                 temp_dobj = dependency[2]
#                 temp_dobj_pos = self.m_words_pos[temp_dobj]
#                 if temp_dobj_pos != None and temp_dobj_pos in PublicKeys.NOUN_POS:
#                     self.m_has_a_dobj = True
#                     self.m_dobj = dependency[2]
#                     self.m_evaluating_object = Entity('dobj', self.m_dobj)
#                 else:
#                     ##print 'Couldn\'t find a dobj noun'
#                     max = 0
#                     matching_noun = None
#                     for noun in self.m_all_noun_lemmas:
#                         for qes in self.m_question.get_quantified_entities().values():
#                             for qe in qes:
#                                 wup_similarity = self.word_similarity(noun, qe.get_name())
#                                 if max < wup_similarity:
#                                     max = wup_similarity
#                                     matching_noun = qe
#
#                     self.m_evaluating_object = Entity('dobj', matching_noun.get_name())

    def get_noun_chunks(self, text):
        response = unirest.post(
            "https://textanalysis.p.mashape.com/spacy-noun-chunks-extraction",
            headers={
                "X-Mashape-Key":
                "KRSu5yA8domshWMHNzhofCid2f3fp1aOWWsjsnuS3zN7CYN9Kq",
                "Content-Type": "application/x-www-form-urlencoded",
                "Accept": "application/json"
            },
            params={"text": text})
        ##print response.body
        ##print response.raw_body
        response_json = json.loads(response.raw_body)
        #         print 'response:',response_json
        ##print response_json["result"]
        return response_json["result"]

    def word_similarity(self, word1, word2):
        xx = wn.synsets(word1, pos=wn.NOUN)
        yy = wn.synsets(word2, pos=wn.NOUN)
        max_similarity = 0
        for x in xx:
            for y in yy:
                # wup_similarity() can return None when the synsets share no common ancestor
                wup_similarity = x.wup_similarity(y) or 0
                if wup_similarity > max_similarity:
                    max_similarity = wup_similarity
        return max_similarity

    def is_integer(self, val):
        is_integer = True
        try:
            int(val)
        except (TypeError, ValueError):
            is_integer = False
        return is_integer

    def extract_result(self):
        ##print 'In extract result'
        quantified_entities = self.m_question.get_quantified_entities()
        result = None
        if self.m_question_label == 'all':
            return QuestionSentenceSolver.solve_for_all_label(self)
        elif self.m_question_label == '+':
            return QuestionSentenceSolver.solve_for_plus_label(self)
        elif self.m_question_label == 'c':
            return ComparisonSentenceSolver.solve_for_c_label(self)
        elif self.m_question_label == 'b':
            return ButConjunctionSentenceSolver.solve_for_but_label(self)
        elif self.m_question_label == 'u':
            return UnknownSentenceSolver.solve_for_unknown_label(self)
        else:
            return None
        # if len(self.m_possible_evaluating_subjects) == 1:
        #     subject = self.m_possible_evaluating_subjects[0]
        #     if subject in quantified_entities:
        #         subjects_object_entities = quantified_entities[subject]
        #
        #         for subjects_object_entity in subjects_object_entities:
        #             ##print 'during comparison'
        #             ##print subjects_object_entity
        #             ##print self.m_possible_evaluating_object
        #             if subjects_object_entity.get_name() == self.m_possible_evaluating_object:
        #                 result = subjects_object_entity
        #                 break
#         return result

#         subjects_object_entities = quantified_entities[self.m_evaluating_subject.get_name()]
#         result = None
#         ##print subjects_object_entities
#         for subjects_object_entity in subjects_object_entities:
#             ##print subjects_object_entity
#             ##print self.m_evaluating_object
#             if subjects_object_entity.get_name() == self.m_evaluating_object.get_name():
#                 result = subjects_object_entity
#                 break
#         return result

    def process_pronouns(self):
        ##print 'process pronouns'
        if self.m_has_a_pronoun:
            singular_pronouns = []
            plural_pronouns = []
            nouns = self.m_question.get_quantified_entities().keys()
            for pronoun_tuple in self.m_all_pronouns:
                pronoun = pronoun_tuple["Text"].lower()
                if pronoun in Sentence.SINGULAR_PRONOUN:
                    singular_pronouns.append(pronoun_tuple)
                    for noun in reversed(nouns):
                        ##print 'found' + noun
                        self.m_processed_pronoun = noun
                        break
                elif pronoun in Sentence.PLURAL_PRONOUN:
                    self.sum_all_entities()

    def sum_all_entities(self):
        ##print "do something"
        pass

    def output(self, ret_math_value):
        output = None
        if ret_math_value:
            output = self.m_cardinal
        elif self.m_predicted_label in ('+', '-'):
            if self.m_cardinal is not None:
                output = self.m_quantified_entity
            else:
                output = self.m_predicted_label + ' X'
        return output
Esempio n. 26
0
def test_period():
    EN = English()
    tokens = EN.tokenizer('best.Known')
    assert len(tokens) == 3
    tokens = EN('zombo.com')
    assert len(tokens) == 1
Esempio n. 27
0
def EN():
    return English()
Esempio n. 28
0
    def test_load(self):
        data_dir = English.default_data_dir()
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
        tokenizer = Tokenizer.from_dir(vocab, path.join(data_dir, 'tokenizer'))
Esempio n. 29
0
import time

start = time.time()
pt.train(sentences_w_tags, nr_iter=5)
end = time.time()

print "time taken = " + str(end - start)


# In[2]:

import os
from spacy.en import English, LOCAL_DATA_DIR, DOC
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
nlp = English(parser=False, entity=False, data_dir=data_dir)


# In[17]:

corpus = "\n".join(" ".join(y) for y in [x[0] for x in sentences_w_tags])

doc = nlp(unicode(corpus))


# In[36]:


n = 0
print doc[n].lemma_
print doc[n].pos_
Esempio n. 30
0
    def test_load(self):
        data_dir = English.default_data_dir()
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
        parser = Parser.from_dir(path.join(data_dir, 'deps'), vocab.strings, ArcEager)
Esempio n. 31
0
import re
import types

import spacy.tokens
from collections import OrderedDict
from spacy.en import English  # NLP with spaCy https://spacy.io

nlp = English()  # will take some time to load

# Useful properties, summary of the docs from https://spacy.io

# class Doc
# properties: text, vector, vector_norm, ents, noun_chunks, sents
# method: similarity
# NER specs https://spacy.io/docs#annotation-ner
# doc tokenization will preserve meaningful units together

# class Token
# token.doc -> parent sequence
# string features: text, lemma, lower, shape
# boolean flags: https://spacy.io/docs#token-booleanflags
# POS: pos_, tag_
# tree: https://spacy.io/docs#token-navigating
# ner: ent_type, ent_iob

# class Span
# span.doc -> parent sequence
# vector, vector_norm
# string features: text, lemma
# methods: similarity
# syntactic parse: use root, lefts, rights, subtree
# https://spacy.io/docs#span-navigativing-parse

# !more to implement:
# also filter to prepare for tree
# syntactic parse tree https://spacy.io/docs#span-navigativing-parse
class SpacyEventExtractor:
    _nlp = English()
    _keywords = list(map(lambda s: s.strip().lower(), open('keywords.txt', 'r').readlines()))
    _known_phrases = [('is out', 'released'), ('is here', 'released'), ('is there', 'released'),
                      ('is out', 'released'), ('is open', 'started'), ('is available', 'released'),
                      ('please welcome', 'we released')
                     ]
    _important_actions = ['release', 'start', 'publish', 'announce', 'update']

    def __init__(self):
        pass

    @staticmethod
    def _have_pronouns(text: str) -> bool:
        pronouns = ['i', 'you', 'he', 'she', 'they', 'be', 'him', 'her', 'it']
        # 'we' is a good pronoun as it refers to a company
        return list(filter(lambda s: s.lower() in pronouns, text.split())) != []

    @staticmethod
    def _is_present_simple(verb: spacy.tokens.Token) -> bool:
        for child in verb.children:
            if child.orth_ == 'will':
                return False  # will have etc
        lemma = verb.lemma_.lower()
        if verb.orth_.lower() in [lemma, lemma + 's', lemma + 'es', 'have', 'has', 'do', 'is', 'are']:
            return True
        return False

    @staticmethod
    def _is_present_continuous(verb: spacy.tokens.Token) -> bool:
        for child in verb.children:
            if child.dep_ == 'aux' and child.lemma_ not in ['be', 'is', 'are', 'am']:
                return False  # will have etc
        return verb.orth_.endswith('ing')

    @staticmethod
    def _get_tree(root: spacy.tokens.Token, depth: int, token_filter: types.FunctionType) -> [spacy.tokens.Token]:
        """Get list of tokens dependent on given root and satisfying given token_filter"""
        if depth == 0:
            return [root] if token_filter(root) else []

        result = []
        # for tokens on the left of the root, whose head is root
        for child in filter(token_filter, root.lefts):
            result += SpacyEventExtractor._get_tree(child, depth - 1, token_filter)
        result.append(root)
        # for tokens on the right of the root, whose head is root
        for child in filter(token_filter, root.rights):
            result += SpacyEventExtractor._get_tree(child, depth - 1, token_filter)
        return result

    @staticmethod
    def _get_chunk(token: spacy.tokens.Token) -> str:
        """Get string representation of a chunk.
        Chunk is one or more tokens that forms semantic unit.
        For example, compound tokens or tokens with dependent tokens."""

        if token is None:
            return ""

        def token_filter(tok):
            """True for various modifiers of tok and compound tokens, which include tok"""
            return tok is token or \
                   tok.dep_.endswith("mod") or \
                   tok.dep_ == "compound"

        tree = SpacyEventExtractor._get_tree(root=token, depth=2, token_filter=token_filter)
        return " ".join(map(str, tree))

    @staticmethod
    def _get_prep_with_word(token: spacy.tokens.Token) -> (str, spacy.tokens.Token):
        """Get prepositional modifiers of the token and important perposition's child"""
        if token is None:
            return "", None

        prep = None
        # search of prepositions
        for child in token.rights:
            if child.dep_ == "prep":
                prep = child
                break
        if prep is None:
            return "", None

        for word in prep.children:
            # if preposition has child of type 'object of preposition' or 'complement of a preposition'
            # then add it to the result
            if word.dep_ in ["pobj", "pcomp"]:
                chunk_str = SpacyEventExtractor._get_chunk(word)
                return str(prep) + " " + chunk_str, word

        return "", None

    @staticmethod
    def _get_full_entity(entity: spacy.tokens.Token) -> str:
        """Get entity token with all related tokens (i.e. prepositional modifiers)
        so, we are extracting such token tree with entity
        entity
            mod & compound
                mod & compound
            prep
                pobj | pcomp
                    mod & compound
                        mod & compound
                    (repeat)
                    prep
                        pobj | pcomp
                            mod & compound
                                mod & compound
                            (repeat)
                            ...
        """
        entity_string = SpacyEventExtractor._get_chunk(entity)

        word = entity
        while True:
            prep, word = SpacyEventExtractor._get_prep_with_word(word)
            if word is None:
                break
            entity_string += " " + prep
        return entity_string
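        # Illustrative walk-through (the exact spans depend on the parse): for
        # "support for Docker containers in the new version", starting from the
        # token "support", _get_chunk() yields "support"; the loop above then
        # follows each prepositional modifier in turn and appends
        # "for Docker containers" and "in new version" (determiners are dropped
        # by the mod/compound filter inside _get_chunk()).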

    @staticmethod
    def _replace_we(replace_we, string):
        """Replace pronoun 'we' in string with string 'replace_we'"""
        new_string = ""
        for word in string.split():
            if word == "we" and replace_we is not None:
                new_string += replace_we + " "
            elif word == "We" and replace_we is not None:
                new_string += replace_we.capitalize() + " "
            else:
                new_string += str(word) + " "
        return new_string

    @staticmethod
    def _remove_extra_whitespaces(text):
        return ' '.join(text.strip().split())

    @staticmethod
    def _get_entity1(span):
        """Get nominal subject of the span's root, if there is one"""
        for word in span:
            if word.head is word: # main verb
                for child in word.children:
                    if child.dep_.endswith("nsubj"):
                        return child
                break
        return None

    @staticmethod
    def _get_action(verb):
        """Get auxiliary verbs of the given verb and the verb itself"""
        aux_verbs = ""
        for child in verb.children:
            if child.dep_ == "aux" or child.dep_ == "neg":
                aux_verbs += str(child)
        return SpacyEventExtractor._remove_extra_whitespaces(str(aux_verbs) + ' ' + str(verb))

    @staticmethod
    def _get_entity2(verb):
        """Get direct object of the given verb, if there is one"""
        for child in verb.children:
            if child.dep_ == "dobj":
                return child
        return None

    @staticmethod
    def extract(text: str, replace_we: str = None) -> [Event]:

        # just because sometimes spaCy fails on sth like we've
        for aux, replace_with in [('ve', 'have'), ('re', 'are')]:
            text = text.replace("'" + aux, " " + replace_with).replace("’" + aux, " " + replace_with)

        # replacing known_phrases
        for abbr, full in SpacyEventExtractor._known_phrases:
            reg = re.compile(abbr, re.IGNORECASE)
            text = reg.sub(full, text)

        if len(text) == 0:
            return []

        text_doc = SpacyEventExtractor._nlp(text)

        events = []
        keywords_set = set(SpacyEventExtractor._keywords)
        for doc in text_doc.sents:
            # if the sentence does not contain at least one keyword, ignore it
            if len(set([word.string.strip().lower() for word in doc]) & keywords_set) == 0:
                continue

            entity1 = SpacyEventExtractor._get_entity1(doc)
            if not entity1:
                continue
            verb = entity1.head
            entity2 = SpacyEventExtractor._get_entity2(verb)

            if SpacyEventExtractor._is_present_simple(verb) or \
                    SpacyEventExtractor._is_present_continuous(verb):
                continue

            entity1_string = SpacyEventExtractor._get_full_entity(entity1)
            entity2_string = SpacyEventExtractor._get_full_entity(entity2)

            entity1_string = SpacyEventExtractor._replace_we(replace_we, entity1_string)
            entity2_string = SpacyEventExtractor._replace_we(replace_we, entity2_string)

            entity1_string = SpacyEventExtractor._remove_extra_whitespaces(entity1_string)
            entity2_string = SpacyEventExtractor._remove_extra_whitespaces(entity2_string)

            # if there are no keywords in either entity string, skip the sentence
            if len(set([word.strip().lower() for word in entity1_string.split()]) & keywords_set) + \
                    len(set(word.strip().lower() for word in entity2_string.split()) & keywords_set) == 0:
                continue

            if SpacyEventExtractor._have_pronouns(entity1_string) or \
                    SpacyEventExtractor._have_pronouns(entity2_string):
                continue

            # entity2 can be empty only in some special cases like: IDEA 2.0 released
            if verb.lemma_.lower() not in SpacyEventExtractor._important_actions and entity2_string == "":
                continue

            action_string = SpacyEventExtractor._get_action(verb)
            event = Event(entity1_string, entity2_string, action_string, str(doc))
            events.append(event)

            print(event)

        return events
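
# Hedged usage sketch for the extractor above: it assumes a keywords.txt file
# with "pycharm" (or another word from the sample text) among its entries, and
# that the Event class from the original project is importable; the sample
# sentence is made up.
if __name__ == '__main__':
    sample_text = "PyCharm 5.0 is out with support for Docker containers."
    extracted_events = SpacyEventExtractor.extract(sample_text, replace_we="JetBrains")
    print("%d event(s) extracted" % len(extracted_events))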
Esempio n. 33
0
# coding: utf-8
from __future__ import unicode_literals

import pytest

from spacy.en import English
from spacy.en import attrs

EN = English()


def test_attr_of_token():
    text = u'An example sentence.'
    tokens = EN(text)
    example = EN.vocab[u'example']
    assert example.orth != example.shape
    feats_array = tokens.to_array((attrs.ORTH, attrs.SHAPE))
    assert feats_array[0][0] != feats_array[0][1]


def test_tag():
    text = u'A nice sentence.'
    tokens = EN(text)
    assert tokens[0].tag != tokens[1].tag != tokens[2].tag != tokens[3].tag
    feats_array = tokens.to_array((attrs.ORTH, attrs.TAG))
    assert feats_array[0][1] == tokens[0].tag
    assert feats_array[1][1] == tokens[1].tag
    assert feats_array[2][1] == tokens[2].tag
    assert feats_array[3][1] == tokens[3].tag

Esempio n. 34
0
def tokenize(texts, max_length, skip=-2, attr=LOWER, merge=False, nlp=None,
             **kwargs):
    """ Uses spaCy to quickly tokenize text and return an array
    of indices.

    This method stores a global NLP directory in memory, and takes
    up to a minute to run for the first time. Later calls will have the
    tokenizer in memory.

    Parameters
    ----------
    texts : list of unicode strings
        These are the input documents. There can be multiple sentences per
        item in the list.
    max_length : int
        This is the maximum number of words per document. If the document is
        shorter than this number it will be padded to this length.
    skip : int, optional
        Short documents will be padded with this variable up until max_length.
    attr : int, from spacy.attrs
        What to transform the token to. Choice must be in spacy.attrs, and
        common choices are (LOWER, LEMMA)
    merge : bool, optional
        Merge noun phrases into a single token. Useful for turning 'New York'
        into a single token.
    nlp : None
        A spaCy NLP object. Useful for not reinstantiating the object multiple
        times.
    kwargs : dict, optional
        Any further argument will be sent to the spaCy tokenizer. For extra
        speed consider setting tag=False, parse=False, entity=False, or
        n_threads=8.

    Returns
    -------
    arr : 2D array of ints
        Has shape (len(texts), max_length). Each value represents
        the word index.
    vocab : dict
        Keys are the word index, and values are the string. The pad index gets
        mapped to '<SKIP>'.

    >>> sents = [u"Do you recall a class action lawsuit", u"hello zombo.com"]
    >>> arr, vocab = tokenize(sents, 10, merge=True)
    >>> arr.shape[0]
    2
    >>> arr.shape[1]
    10
    >>> w2i = {w: i for i, w in vocab.iteritems()}
    >>> arr[0, 0] == w2i[u'do']  # First word and its index should match
    True
    >>> arr[0, 1] == w2i[u'you']
    True
    >>> arr[0, -1]  # last word in 0th document is a pad word
    -2
    >>> arr[0, 4] == w2i[u'class action lawsuit']  # noun phrase is tokenized
    True
    >>> arr[1, 1]  # The URL token is thrown out
    -2
    """
    if nlp is None:
        nlp = English()
    data = np.zeros((len(texts), max_length), dtype='int32')
    data[:] = skip
    bad_deps = ('amod', 'compound')
    for row, doc in enumerate(tqdm(list(nlp.pipe(texts, **kwargs)),
                                   desc="tokenizing")):
        if merge:
            # from the spaCy blog, an example on how to merge
            # noun phrases into single tokens
            for phrase in doc.noun_chunks:
                # Only keep adjectives and nouns, e.g. "good ideas"
                while len(phrase) > 1 and phrase[0].dep_ not in bad_deps:
                    phrase = phrase[1:]
                if len(phrase) > 1:
                    # Merge the tokens, e.g. good_ideas
                    phrase.merge(phrase.root.tag_, phrase.text,
                                 phrase.root.ent_type_)
                # Iterate over named entities
                for ent in doc.ents:
                    if len(ent) > 1:
                        # Merge them into single tokens
                        ent.merge(ent.root.tag_, ent.text, ent.label_)
        dat = doc.to_array([attr, LIKE_EMAIL, LIKE_URL]).astype('int32')
        if len(dat) > 0:
            dat = dat.astype('int32')
            msg = "Negative indices reserved for special tokens"
            assert dat.min() >= 0, msg
            # Replace email and URL tokens
            idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
            dat[idx] = skip
            length = min(len(dat), max_length)
            data[row, :length] = dat[:length, 0].ravel()
    uniques = np.unique(data)
    vocab = {v: nlp.vocab[v].lower_ for v in uniques if v != skip}
    vocab[skip] = '<SKIP>'
    return data, vocab
Esempio n. 35
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 28 09:26:02 2017

@author: arlittr
"""

from spacy.en import English
parser = English()

import pandas as pd
from nltk.corpus import stopwords as stopwords
import networkx as nx
import string
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from math import inf
import hdbscan
from datetime import datetime


def cleanPassage(rawtext):
    #some code from https://nicschrading.com/project/Intro-to-NLP-with-spaCy/

    #if data is bad, return empty
    if type(rawtext) is not str:
        return ''

    #split text with punctuation
Esempio n. 36
0
from spacy.en import English, LOCAL_DATA_DIR
import spacy.en
import os, time

data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
nlp = English(parser=False, tagger=True, entity=False)


def print_fine_pos(token):
    return token.tag_


def pos_tags(sentence):
    # sentence = str(sentence, "utf-8")
    # sentence = sentence.decode("utf-8")
    tokens = nlp(sentence)
    tags = []
    for tok in tokens:
        tags.append((tok, print_fine_pos(tok)))

    words = []
    for (pos, tag) in tags:
        words.append(pos.text)
    print(words)

    return tags


start = time.time()
a = "The dosa was brilliant and so was the samosa"
print(pos_tags(a))
Esempio n. 37
0
# Normalize text - This function requires modification and improvements
def clean_text(text):

    for i in range(0,len(mapOfWords)):
        if len(mapOfWords['Original'].iloc[i].split()) <= 1:
            text = re.sub( r'\b'+ mapOfWords['Original'].iloc[i] +'([.,\s]|$)', ' ' + mapOfWords['Map'].iloc[i] + ' ',text, flags=re.IGNORECASE)
        else:
            text = re.sub( r''+ mapOfWords['Original'].iloc[i] +'([.,\s]|$)', ' ' + mapOfWords['Map'].iloc[i] + ' ',text, flags=re.IGNORECASE)
    text = text.strip() # Trim string
    
    return text
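
# Illustrative stand-in for the `mapOfWords` lookup table used above (the real
# table comes from elsewhere in the original script): judging by the .iloc
# accesses in clean_text, it is a pandas DataFrame with an 'Original' spelling
# column and a 'Map' replacement column.
import pandas as pd
mapOfWords = pd.DataFrame({'Original': ['colour', 'e mail'],
                           'Map': ['color', 'email']})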

# Some extra steps for normalization and removing unnecessary spaces and some improvements of observed issues

nlp = English() # required both for sentence tokenization and lemmatization
nlp.vocab.morphology.lemmatizer.exc[u'verb'][u'need'] = ('need',)
nlp.vocab.morphology.lemmatizer.exc[u'noun'][u'tier'] = ('tier',)
nlp.vocab.morphology.lemmatizer.exc[u'adj'][u'tier'] = ('tier',)

def calling_clean_text(text):
    text = re.sub( r'\bcant\b','cannot',text, flags=re.IGNORECASE)
    text = re.sub( 'can\'t','cannot',text, flags=re.IGNORECASE)
    text = re.sub( 'i\'m','I am',text, flags=re.IGNORECASE)
    text = re.sub( 'won\'t','will not',text, flags=re.IGNORECASE)
    text = re.sub( 'n\'t',' not',text, flags=re.IGNORECASE)
    text = re.sub( '\'s','s',text, flags=re.IGNORECASE)
    text = re.sub( '\'ve',' have',text, flags=re.IGNORECASE)
    text = re.sub( '%',' percent ',text)
    text = re.sub(r'[^\w\s](?<![\-.,%\'])',' ',text)
    #text = re.sub(r'\w\s(?<![\-.,])',' ',text)
Esempio n. 38
0
    """
    Function that takes a text and returns an xml object containing the NAF.
    """
    doc = nlp(text)
    time = current_time()
    return naf_from_doc(doc, time=time)


def NAF_to_string(NAF, byte=False):
    """
    Function that takes an XML object containing NAF, and returns it as a string.
    If byte is True, then the output is a bytestring.
    """
    xml_string = etree.tostring(NAF, pretty_print=True, with_comments=True)
    if byte:
        return xml_string
    else:
        return xml_string.decode('utf-8')


# Command line functionality: given name of a file, process the file contents and
# print the NAF to stdout.
if __name__ == '__main__':
    import sys
    from spacy.en import English
    nlp = English()
    with open(sys.argv[1]) as f:
        text = f.read()
        NAF = text_to_NAF(text, nlp)
        print(NAF_to_string(NAF))
Esempio n. 39
0
def tokenize(texts,
             max_length,
             skip=-2,
             attr=LOWER,
             merge=False,
             nlp=None,
             **kwargs):
    """ Uses spaCy to quickly tokenize text and return an array
    of indices.

    This method stores a global NLP directory in memory, and takes
    up to a minute to run for the first time. Later calls will have the
    tokenizer in memory.

    Parameters
    ----------
    texts : list of unicode strings
        These are the input documents. There can be multiple sentences per
        item in the list.
    max_length : int
        This is the maximum number of words per document. If the document is
        shorter than this number it will be padded to this length.
    skip : int, optional
        Short documents will be padded with this variable up until max_length.
    attr : int, from spacy.attrs
        What to transform the token to. Choice must be in spacy.attrs, and
        common choices are (LOWER, LEMMA)
    merge : bool, optional
        Merge noun phrases into a single token. Useful for turning 'New York'
        into a single token.
    nlp : None
        A spaCy NLP object. Useful for not reinstantiating the object multiple
        times.
    kwargs : dict, optional
        Any further argument will be sent to the spaCy tokenizer. For extra
        speed consider setting tag=False, parse=False, entity=False, or
        n_threads=8.

    Returns
    -------
    arr : 2D array of ints
        Has shape (len(texts), max_length). Each value represents
        the word index.
    vocab : dict
        Keys are the word index, and values are the string. The pad index gets
        mapped to '<SKIP>'.

    >>> sents = [u"Do you recall a class action lawsuit", u"hello zombo.com"]
    >>> arr, vocab = tokenize(sents, 10, merge=True)
    >>> arr.shape[0]
    2
    >>> arr.shape[1]
    10
    >>> w2i = {w: i for i, w in vocab.iteritems()}
    >>> arr[0, 0] == w2i[u'do']  # First word and its index should match
    True
    >>> arr[0, 1] == w2i[u'you']
    True
    >>> arr[0, -1]  # last word in 0th document is a pad word
    -2
    >>> arr[0, 4] == w2i[u'class action lawsuit']  # noun phrase is tokenized
    True
    >>> arr[1, 1]  # The URL token is thrown out
    -2
    """
    if nlp is None:
        nlp = English()
    data = np.zeros((len(texts), max_length), dtype='int32')
    data[:] = skip
    bad_deps = ('amod', 'compound')
    for row, doc in enumerate(nlp.pipe(texts, **kwargs)):
        if merge:
            # from the spaCy blog, an example on how to merge
            # noun phrases into single tokens
            for phrase in doc.noun_chunks:
                # Only keep adjectives and nouns, e.g. "good ideas"
                while len(phrase) > 1 and phrase[0].dep_ not in bad_deps:
                    phrase = phrase[1:]
                if len(phrase) > 1:
                    # Merge the tokens, e.g. good_ideas
                    phrase.merge(phrase.root.tag_, phrase.text,
                                 phrase.root.ent_type_)
                # Iterate over named entities
                for ent in doc.ents:
                    if len(ent) > 1:
                        # Merge them into single tokens
                        ent.merge(ent.root.tag_, ent.text, ent.label_)
        dat = doc.to_array([attr, LIKE_EMAIL, LIKE_URL]).astype('int32')
        if len(dat) > 0:
            dat = dat.astype('int32')
            msg = "Negative indices reserved for special tokens"
            assert dat.min() >= 0, msg
            # Replace email and URL tokens
            idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
            dat[idx] = skip
            length = min(len(dat), max_length)
            data[row, :length] = dat[:length, 0].ravel()
    uniques = np.unique(data)
    vocab = {v: nlp.vocab[v].lower_ for v in uniques if v != skip}
    vocab[skip] = '<SKIP>'
    return data, vocab