Code example #1
    def __init__(self):
        self.data_preparator = DataPreparatorRelation(0, None, False)
        self.model = self.load_model()
        self.relations = self.load_relations()
        self.nlp = English()
        self.relation_types = [
            'adjs_nouns', 'verbs_adverbs', 'verbs_prepositions',
            'verbs_objects', 'verbs_dir_objects', 'subjects_verbs',
            'nouns_adjs'
        ]
Code example #2
def to_nlp_objs(sentences):
    global nlp_parser
    # init once
    if nlp_parser is None:
        nlp_parser = English()

    nlp_objs = []
    for s in sentences:
        nlp_objs.append(nlp_parser(s.decode('unicode-escape'), entity=False))
    return nlp_objs
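A minimal usage sketch for the helper above, assuming Python 2 byte strings (the snippet calls s.decode('unicode-escape')) and a module-level nlp_parser global initialised to None, as the function expects:

nlp_parser = None  # module-level cache assumed by to_nlp_objs()

docs = to_nlp_objs(["Autumn in New York is lovely.", "spaCy parses fast."])
print(len(docs[0]))  # number of tokens in the first parsed sentence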
Code example #3
def cleaner(text):
    nlp = English()
    text = re.sub(r'<.>', ' ', text)   # drop 3-character tags such as <p>
    text = re.sub(r'<..>', ' ', text)  # drop 4-character tags such as <br>
    text = re.sub(r'\.+', ' ', text)
    text = re.sub(r'[^a-z0-9 ]', '', text.lower())
    text = re.sub(r'\d+', 'NUMBER ', text)
    text = re.sub(r'\s+', ' ', text)
    text = ' '.join(i.orth_ for i in nlp(text) if i.orth_ not in STOP_WORDS)
    return text
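A quick usage sketch, assuming STOP_WORDS is a set of lowercase stop words defined elsewhere in the module:

print(cleaner('The <b>price</b> rose to 42 dollars... allegedly.'))
# with a typical stop list, roughly: 'price rose NUMBER dollars allegedly'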
Code example #4
File: libner.py Project: lgood712/Cornell-Courses
    def sent2features(self):
        """Returns a function used to extract features from a sentence"""
        if self.features == 'spacy':
            global SPACY
            if not SPACY:
                print('loading spacy')
                from spacy.en import English
                SPACY = English(load_vectors=False)
                print('loaded SPACY')
            return spacy_sent2features
Code example #5
def get_tags_of_sentence(text):
    parser = English()
    tokens = parser(text)
    the_terms = []
    the_pos = []
    for token in tokens:
        the_pos.append(token.pos_)
        the_terms.append(token.orth_)
        #for more detail: token.dep_,token.head
    return list(zip(the_terms, the_pos))
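Example call; older spaCy releases expect a unicode string:

print(get_tags_of_sentence(u'Apple is looking at buying a startup.'))
# e.g. [('Apple', 'PROPN'), ('is', 'VERB'), ('looking', 'VERB'), ...]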
Code example #6
File: test_tokens_api.py Project: kengz/spaCy
def test_parse_tree(EN):
    text = 'I like New York in Autumn.'
    EN = English(parser=False)
    doc = EN(text, tag=True)
    doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T)
    # full method parse_tree(text) is a trivial composition
    trees = doc.print_tree()
    assert len(trees) > 0
    tree = trees[0]
    assert all(k in list(tree.keys()) for k in ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc', 'modifiers'])
    assert tree['word'] == 'like' # check root is correct
Code example #7
def testSVOs():
    nlp = English()

    # parcer = English()

    tok = nlp(
        "Find 2 numbers whose sum is 64 and whose difference is 4. What is the smaller number? What is the larger number?"
    )
    svos = findSVOs(tok)
    printDeps(tok)
    print(svos)
Code example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-isTrain', type=int, default=1)
    args = parser.parse_args()

    nlp = English()  # used for counting the number of tokens

    if args.isTrain == 1:
        annFile = '../data/mscoco_train2014_annotations.json'
        quesFile = '../data/OpenEnded_mscoco_train2014_questions.json'
        questions_file = open('../data/preprocessed/questions_train2014.txt',
                              'w')
        questions_lengths_file = open(
            '../data/preprocessed/questions_lengths_train2014.txt', 'w')
        answers_file = open('../data/preprocessed/answers_train2014.txt', 'w')
        coco_image_id = open('../data/preprocessed/images_train2014.txt', 'w')
        trainval = 'training data'
    else:
        annFile = '../data/mscoco_val2014_annotations.json'
        quesFile = '../data/OpenEnded_mscoco_val2014_questions.json'
        questions_file = open('../data/preprocessed/questions_val2014.txt',
                              'w')
        questions_lengths_file = open(
            '../data/preprocessed/questions_lengths_val2014.txt', 'w')
        answers_file = open('../data/preprocessed/answers_val2014.txt', 'w')
        coco_image_id = open('../data/preprocessed/images_val2014.txt', 'w')
        trainval = 'validation data'

    #initialize VQA api for QA annotations
    vqa = VQA(annFile, quesFile)
    questions = vqa.questions
    ques = questions['questions']
    qa = vqa.qa

    pbar = progressbar.ProgressBar()
    print 'Dumping questions, answers, imageIDs, and question lengths to text files...'
    for i, q in pbar(zip(xrange(1, len(ques) + 1), ques)):
        questions_file.write(q['question'].encode('utf8'))
        questions_file.write('\n'.encode('utf8'))
        questions_lengths_file.write(
            str(len(nlp(q['question']))).encode('utf8'))
        questions_lengths_file.write('\n'.encode('utf8'))

        coco_image_id.write(str(q['image_id']).encode('utf8'))
        coco_image_id.write('\n')
        if args.isTrain:
            answers_file.write(
                getModalAnswer(qa[q['question_id']]['answers']).encode('utf8'))
        else:
            answers_file.write(
                getAllAnswer(qa[q['question_id']]['answers']).encode('utf8'))
        answers_file.write('\n'.encode('utf8'))

    print 'completed dumping', trainval
Code example #9
    def __init__(self, name='QueryDefinitionScorer', description='', short_name='qds', strategy='max'):
        """ If a question is of the form 'what is X' score the extent to which a single sentence
            in the answer is of the form 'X is ...'

            args:
                name, description, short_name (str): See QueryDocumentScorer
                strategy (str): The scoring strategy. Must be one of the following: 'max', 'average'
        """
        super(QueryDefinitionScorer, self).__init__(name=name, description=description, short_name=short_name)
        self.strategy = strategy
        self.nlp = English()
Code example #10
    def __init__(self, fetcher, dry_run=False):

        self.fetcher = fetcher
        self.logger = logging.getLogger(__name__)
        self.parser = English()
        # A custom stoplist
        STOPLIST = set(
            nltk_stopwords.words('english') +
            ["n't", "'s", "'m", "ca", "p", "t"] + list(ENGLISH_STOP_WORDS))
        ALLOWED_STOPLIST = set(('non',))  # one-element tuple; set(('non')) would yield {'n', 'o'}
        self.STOPLIST = STOPLIST - ALLOWED_STOPLIST
Code example #11
    def substitute(self):
        NLP_Parser = English()
        summary = self.summary.split('<')[0]
        NLP_Tokens = NLP_Parser(summary)
        tokenized_summary = [token.orth_ for token in NLP_Tokens]
        substituted_summary = [word for word in tokenized_summary]
        for word in tokenized_summary:
            if word.lower() in config.substitutions:
                substituted_summary[substituted_summary.index(
                    word)] = config.substitutions[word]

        self.tweet_substituted = ' '.join(substituted_summary)
Code example #12
File: parse_wikipedia.py Project: Raldir/taxi
def main():
    """
    Creates a "knowledge resource" from triplets file
    """

    # Get the arguments
    args = docopt(
        """Parse the Wikipedia dump and create a triplets file, each line is formatted as follows: X\t\Y\tpath

    Usage:
        parse_wikipedia.py <in_file> <out_file>

        <in_file> = the Wikipedia dump file
        <out_file> = the output (parsed) file
    """)

    nlp = English()

    in_file = args['<in_file>']
    out_file = args['<out_file>']

    wrote_paths = 0

    with codecs.open(in_file, 'r') as f_in:
        with codecs.open(out_file, 'w') as f_out:
            with codecs.open(out_file + "_paths", 'w') as f_path_out:

                # Read the next paragraph
                for paragraph in f_in:

                    # Skip empty lines
                    paragraph = paragraph.replace("'''", '').strip()
                    paragraph = paragraph.decode("ascii",
                                                 errors="ignore").encode()
                    if len(paragraph) == 0:
                        continue

                    parsed_par = nlp(unicode(paragraph))

                    # Parse each sentence separately
                    for sent in parsed_par.sents:
                        dependency_paths = parse_sentence(sent)
                        if len(dependency_paths) > 0:
                            for dependency_triple in dependency_paths:
                                f_path_out.write('%s\n' % dependency_triple[2])
                                wrote_paths += 1

                            out = '\n'.join(
                                ['\t'.join(path) for path in dependency_paths])
                            print >> f_out, out

    print("Wrote %s paths to file: %s" % (wrote_paths, out_file + "_paths"))
    print("Finished processing file: %s" % in_file)
Code example #13
    def __init__(self, **kwargs):
        self.nlp = English()
        self.window = kwargs.pop('window')
        self.nps_model_tag = kwargs.pop('nps_model_tag')

        log.info('Preparing sequence feature extractor... (window=%s)',
                 str(self.window))
        self.window_feature_extractors = [
            WordFeatures(),
            TagFeatures(),
        ]
        self.feature_extractors = [GazeteerFeatures(self.nps_model_tag)]
Code example #14
File: NPChunker.py Project: dakshvar22/DishingOut
    def __init__(self):
        self.pattern = """

                NP2: {<JJ.?>+ <RB>? <JJ.?>* <NN.?|FW>+ <VB.?>* <JJ.?>*}
                NP1: {<JJ.?>? <NN.?|FW>+ <CC>? <NN.?|FW>* <VB.?>? <RB.?>* <JJ.?>+ (<CC><JJ.?>)?}

            """
        self.st = None
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = RegexpTokenizer(r'\w+')
        self.nlp = English(parser=False, tagger=True, entity=False)
        self.exclude = set(string.punctuation)
Code example #15
def main():
    nlp = English()
    texts = [
        u'Net income was $9.4 million compared to the prior year of $2.7 million.',
        u'Revenue exceeded twelve billion dollars, with a loss of $1b',
    ]

    for text in texts:
        doc = nlp(text)
        relations = extract_currency_relations(doc)
        for r1, r2 in relations:
            print(r1.text, r2.ent_type_)
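The snippet relies on extract_currency_relations(), which is not shown here. Below is a minimal sketch in the spirit of spaCy's dependency-pattern money example; the function body and the exact dependency labels are assumptions, not code taken from the snippet's project:

def extract_currency_relations(doc):
    # Pair each MONEY-typed token with the token it is attributed to,
    # following the dependency arcs produced by the parser.
    relations = []
    for money in filter(lambda t: t.ent_type_ == 'MONEY', doc):
        if money.dep_ in ('attr', 'dobj'):
            subjects = [t for t in money.head.lefts if t.dep_ == 'nsubj']
            if subjects:
                relations.append((subjects[0], money))
        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
            relations.append((money.head.head, money))
    return relations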
Code example #16
File: svo.py Project: pranavkanade/Dump
def test_svo_s():
    nlp = English()
    tok = nlp(
        "For Halloween Debby and her sister "
        "combined the candy they received. "
        "Debby had 32 pieces of candy while her sister"
        " had 42. If they ate 35 pieces the first night, how many pieces do they have left?"
    )
    svos = find_svo_s(tok)
    print_dep_s(tok)  # prints dependencies
    print(svos)
    """EOF"""
Code example #17
    def __init__(self):
        print('Loading GloVe data... ', end='', flush=True)
        self._nlp = English()
        # TODO(Bernhard): try word2vec instead of glove..
        print('Done.')

        # embedding_dims of glove
        embedding_dims = 300

        self._model = Sequential()
        self._model.add(
            Reshape(input_shape=(embedding_dims, ),
                    target_shape=(embedding_dims, )))
Code example #18
    def __init__(self,
                 host='localhost',
                 port=27017,
                 db='python_import',
                 collection='earnings_call_Nas100_Broad_Manual_Update'
                 ):  # earnings_transcript, earnings_call_Dow30_Broad,
        self.client = pymongo.MongoClient(host=host, port=port)
        self.db = self.client[db]
        self.collection = self.db[collection]
        self.nlp = English()

        self.initialize_logging(db, host, port)
        self.initialize_dictionaries()
Code example #19
def smm4h_inference_old(config, verbose=0, as_ensemble=True):

    print("Loading embedding model, this can take some time...")
    model = Wordsim.load_vector(config["embedd_model_path"])
    print("Loading spaCy model, this can take some time...")
    nlp = English()

    if config["checkpoint_file"] is not None:
        n_models = len(config["checkpoint_file"])
    else:
        n_models = len(config["run_name"])

    # Assume ensemble of models and run only first model if not
    ensemble_output_scores = []
    std = 0

    # Read lines
    with open(config["submission_out"], "w", encoding='utf-8') as out_file:
        count = 0
        with open(config["test_path"], 'r', encoding='utf-8') as in_file:
            for line in in_file:
                # Get data
                rows = line.strip().split("\t")
                text = rows[1]

                # Convert to embedding
                doc_vector = text_to_embeddings.text_to_vec(
                    model, nlp, text, config["embedding_dimension"],
                    config["document_length"])
                x_test = doc_vector.reshape(
                    1,
                    config["embedding_dimension"] * config["document_length"])

                # Dummy label
                y_test = np.empty([1, config["n_classes"]])

                # Get prediction
                for model_id in range(n_models):
                    eval_score, std = model_inference(as_ensemble, config,
                                                      ensemble_output_scores,
                                                      model_id, std, verbose,
                                                      x_test, y_test)

                mean_ensemble_scores = np.mean(
                    np.asarray(ensemble_output_scores), axis=0)
                y_pred = np.argmax(mean_ensemble_scores, 1)

                # Write
                out_file.write(line.strip() + "\t" + str(y_pred[0]) + "\n")
                count += 1
    return None
Code example #20
def compute_block(block_no):
    reviews = pd.read_csv(
        '../amazon/tmp/reviews_electronics_block_{}.csv'.format(block_no))
    parser = English()
    result = []
    for idx in reviews.index:
        if isinstance(reviews.ix[idx, 'reviewText'], str):
            parsed_review = parser(reviews.ix[idx, 'reviewText'])
            counts = defaultdict(int)
            for word in parsed_review:
                if word.lemma_ in posneg.keys():
                    current_word = word
                    # find the head of all heads
                    while current_word.head is not current_word:
                        current_word = current_word.head
                    n_negations = check_for_negations(current_word)

                    weight = posneg[word.lemma_]
                    if n_negations % 2:
                        # means the statement was negated
                        weight *= -1
                    if weight < 0:
                        counts['count_negative'] += 1
                        counts['count_negative_weighted'] -= weight
                    else:
                        counts['count_positive'] += 1
                        counts['count_positive_weighted'] += weight
                if word.lemma_ in negations:
                    counts['count_negations'] += 1
            # this is a little uglier than writing to a dataframe immediately but _a lot_ faster
            result.append([
                reviews.ix[idx, 'reviewID'],
                " ".join([x.lemma_
                          for x in parsed_review]), counts['count_negations'],
                counts['count_negative'], counts['count_negative_weighted'],
                counts['count_positive'], counts['count_positive_weighted']
            ])
        else:
            result.append([
                reviews.ix[idx, 'reviewID'], None, None, None, None, None, None
            ])

    pd.DataFrame(
        result,
        columns=[
            'reviewID', 'reviewTextLemmatized', 'count_negations',
            'count_negative', 'count_negative_weighted', 'count_positive',
            'count_positive_weighted'
        ]).to_csv(
            '../amazon/tmp/reviews_electronics_block_{}_lemmatized.csv'.format(
                block_no))
Code example #21
def process(batch_id, inputs, output_dir, lang, n_threads, batch_size,
            min_ngram, max_ngram):
    logging.info('Processing batch_id: {}'.format(batch_id))
    subtrees = PreshCounter()
    subtrees_string_map = StringStore()
    noun_chunks = PreshCounter()
    noun_chunks_string_map = StringStore()

    if lang.lower() == "en":
        from spacy.en import English
        NLU = English()
        NLU.matcher = None
    elif lang.lower() == "id":
        from spacy.id import Indonesian
        NLU = Indonesian()
        NLU.matcher = None

    for i, doc in enumerate(
            NLU.pipe(inputs, batch_size=batch_size, n_threads=n_threads)):
        phrases = set()
        for tok in doc:
            st_len = len(list(tok.subtree))
            if min_ngram <= st_len <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws)
                              for t in tok.subtree]).strip()
                orth = subtrees_string_map[st]
                subtrees.inc(orth, 1)
        for np in doc.noun_chunks:
            if min_ngram <= len(np) <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws) for t in np]).strip()
                orth = noun_chunks_string_map[st]
                noun_chunks.inc(orth, 1)

        if i % batch_size == 0:
            logging.info('Processing batch_id: {}, doc: {}'.format(
                batch_id, i))

    output_fname = path.join(output_dir, 'batch{}.st.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in subtrees:
            st = subtrees_string_map[orth]
            if count >= 5 and '!LONGWORD!' not in st:
                out.write('{}\t{}\n'.format(count, st))

    output_fname = path.join(output_dir, 'batch{}.np.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in noun_chunks:
            if count >= 5:
                st = noun_chunks_string_map[orth]
                out.write('{}\t{}\n'.format(count, st))
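A hypothetical invocation of process(); the paths and batch parameters below are illustrative placeholders, not values from the original project:

process(batch_id=0,
        inputs=[u'The quick brown fox jumps over the lazy dog.'],
        output_dir='/tmp/batches', lang='en',
        n_threads=2, batch_size=64, min_ngram=2, max_ngram=4)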
Code example #22
File: sent_count.py Project: crachmanin/UtahSquad
def main():
    parser = English()
    fn = "../train-v1.1.json"
    with open(fn) as fp:
        data = json.load(fp)

    sent_counts = Counter()
    for topic in data['data']:
        for pgraph in topic['paragraphs']:
            context = pgraph['context']
            num_sents = len(list(parser(context).sents))
            sent_counts[num_sents] += 1

    print sent_counts
Code example #23
File: jerblib.py Project: starkshift/jerbs
    def Spider(self, *argv):
        jobs = []
        nlp = English()
        self.data = list()
        for joburl in self.spider(self, *argv):
            self.data.append(dict())
            r = requests.get(joburl, cookies=self.cookies)
            soup = BeautifulSoup(r.text, "lxml")
            content = soup.body.find('div', attrs={'id': 'ctl00_MainContent_PrimaryContent'})
            jobs.append(dict())
            for span in content.span.find_all('span', recursive=True):
                if span.has_attr('aria-labelledby'):
                    # print span['id'] + " : " + span.text
                    self.data[-1][span['id']] = span.getText(separator=u' ')
Code example #24
def main():
    parser = English()
    fn = os.path.join(DATA_DIR, TRAIN_FILE)
    with open(fn) as fp:
        data = json.load(fp)

    result = []

    # data["data"][0]["paragraphs"][0]["qas"][0]["answers"][0]["text"]
    bad_count = 0
    for topic in data['data']:
        for pgraph in topic['paragraphs']:
            token_dict = {}
            context = pgraph['context']
            c_parsed = parser(context)
            sents = list(c_parsed.sents)
            for sent_num, sent in enumerate(sents):
                for token in sent:
                    token_dict[token.idx] = (token, sent_num, token.i - sent.start)

            for qa in pgraph['qas']:
                question = qa['question']
                q_parsed = parser(question)
                q_tokens = [token.orth_ for token in q_parsed]
                # e_question = str(question).encode('utf-8')
                for ans in qa['answers']:
                    answer_start = ans['answer_start']
                    answer_text = ans['text']
                    a_parsed = parser(answer_text)
                    a_tokens = [token.orth_ for token in a_parsed]
                    if answer_start in token_dict:
                        token, sent_num, answer_idx = token_dict[answer_start]
                        sent_tokens = [token.orth_ for token in sents[sent_num]]
                        end_idx = answer_idx + len(a_tokens) - 1
                        if end_idx >= len(sent_tokens) or sent_tokens[end_idx] != a_tokens[-1] or len(a_tokens) != 1:
                            # print str((c_tokens[end_idx], a_tokens[-1])).encode('utf-8')
                            # print str(c_tokens[answer_idx:answer_idx+ 5]).encode('utf-8')
                            # print str(a_tokens).encode('utf-8')
                            # print
                            bad_count += 1
                            continue
                        # line = [e_context, e_question, str(answer_idx)]
                        if answer_idx <= MAX_CLENGTH:
                            line = [sent_tokens, q_tokens, str(answer_idx), str(end_idx)]
                            result.append(line)

    out_fn = os.path.join(DATA_DIR, "sent-train-preproc.json")
    with open(out_fn, 'w') as train_fp:
        train_fp.write(json.dumps(result))
    print bad_count
Code example #25
def main():
    nlp = English()
    texts = [
        u'Donald Trump issued the ban orders recently.'
        u'It prohibited people from 7 countries entering the US for 90 days.'
        u'Net income was $9.4 million compared to the prior year of $2.7 million.',
        u'Revenue exceeded twelve billion dollars, with a loss of $1b.',
    ]

    for text in texts:
        doc = nlp(text)
        relations = extract_currency_relations(doc)
        for r1, r2 in relations:
            print(r1.text, r2.ent_type_, r2.text)
Code example #26
def test_list_orphans():
    # Test case from NSchrading
    nlp = English(load_vectors=False)
    samples = ["a", "test blah wat okay"]
    lst = []
    for sample in samples:
        # Go through all the samples, call nlp() on each to get tokens,
        # pass those tokens to the _orphan_from_list() function, get a list back
        # and put all results in another list
        lst.extend(_orphan_from_list(nlp(sample)))
    # go through the list of all tokens and try to print orth_
    orths = ['a', 'test', 'blah', 'wat', 'okay']
    for i, l in enumerate(lst):
        assert l.orth_ == orths[i]
Code example #27
File: conll_train.py Project: slonik-az/spaCy
def main(train_loc, dev_loc, model_dir):
    with codecs.open(train_loc, 'r', 'utf8') as file_:
        train_sents = read_conll(file_)
    train(English, train_sents, model_dir)
    nlp = English(data_dir=model_dir)
    dev_sents = read_conll(open(dev_loc))
    scorer = Scorer()
    for _, sents in dev_sents:
        for annot_tuples, _ in sents:
            score_model(scorer, nlp, None, annot_tuples)
    print('TOK', 100 - scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)
Code example #28
	def __init__(self, name='ProperNounRatioScorer', short_name='pnrs', description='Proper Noun Ratio Scorer',
					nlp=None):
		"""
			Class that computes the ratio of proper nouns in a query
			The idea is that a query with a large fraction of proper nouns will tend to be a keyword query

			Args:
				name, short_name, description (str): See query_scorer.QueryScorer
				nlp (spacy.en.English): Tokenizes incoming text
		"""
		super(ProperNounRatioScorer, self).__init__(name=name, short_name=short_name, description=description)
		if nlp:
			self.nlp_ = nlp
		else:
			self.nlp_ = English()
Code example #29
File: run_spacy.py Project: yofayed/spacy-benchmarks
def main(giga_db_loc, n_docs, pos_tag=False, parse=False):
    docs = Gigaword(giga_db_loc, limit=n_docs)
    nlp = English()
    out_dir = '/tmp/spacy_out'
    if path.exists(out_dir):
        shutil.rmtree(out_dir)
    for i, doc in enumerate(docs):
        tokens = nlp(doc, tag=pos_tag, parse=parse)
        with codecs.open(path.join(out_dir, '%d.txt' % i), 'w',
                         'utf8') as file_:
            for sent in tokens.sents:
                for word in sent:
                    # one tab-separated row per token; format string matches the four fields
                    file_.write(
                        '%s\t%s\t%d\t%s\n' %
                        (word.orth_, word.tag_, word.head.i, word.dep_))
Code example #30
def tokenize(text, lang):
	if lang == 'en':
		parser = English()
	elif lang == 'fr':
		parser = French()

	lda_tokens = []
	tokens = parser(text)
	# spaCy splits contractions at the apostrophe
	for token in tokens:
		if token.orth_.isspace():
			continue
		else:
			lda_tokens.append(token.lower_)
	return lda_tokens
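Example usage; the 'fr' branch assumes French has been imported (e.g. from spacy.fr import French in older spaCy releases):

print(tokenize(u'Natural language processing is fun.', 'en'))
# e.g. ['natural', 'language', 'processing', 'is', 'fun', '.']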