def __init__(self):
    self.data_preparator = DataPreparatorRelation(0, None, False)
    self.model = self.load_model()
    self.relations = self.load_relations()
    self.nlp = English()
    self.relation_types = [
        'adjs_nouns', 'verbs_adverbs', 'verbs_prepositions', 'verbs_objects',
        'verbs_dir_objects', 'subjects_verbs', 'nouns_adjs'
    ]
def to_nlp_objs(sentences):
    global nlp_parser
    # init once
    if nlp_parser is None:
        nlp_parser = English()

    nlp_objs = []
    for s in sentences:
        nlp_objs.append(nlp_parser(s.decode('unicode-escape'), entity=False))
    return nlp_objs
def cleaner(text):
    nlp = English()
    text = re.sub('<.>', ' ', text)
    text = re.sub('<..>', ' ', text)
    text = re.sub(r'\.+', ' ', text)
    text = re.sub('[^a-z0-9 ]', '', text.lower())
    text = re.sub(r'\d+', 'NUMBER ', text)
    text = re.sub(r'\s+', ' ', text)
    text = ' '.join(i.orth_ for i in nlp(text) if i.orth_ not in STOP_WORDS)
    return text
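# Hypothetical usage sketch for cleaner() above (not part of the original source).
# It assumes `re`, spaCy's English, and the STOP_WORDS collection used by the
# function are importable in this module; the sample string is made up.
if __name__ == '__main__':
    raw = u'Sales grew 12% in 2016... see <b>the report</b> for details.'
    # markup and punctuation are stripped, digit runs collapse to 'NUMBER',
    # and stopword tokens are dropped before the cleaned string is returned
    print(cleaner(raw))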
def sent2features(self):
    """Returns a function used to extract features from a sentence"""
    if self.features == 'spacy':
        global SPACY
        if not SPACY:
            print('loading spacy')
            from spacy.en import English
            SPACY = English(load_vectors=False)
            print('loaded SPACY')
        return spacy_sent2features
def get_tags_of_sentence(text):
    parser = English()
    tokens = parser(text)
    the_terms = []
    the_pos = []
    for token in tokens:
        the_pos.append(token.pos_)
        the_terms.append(token.orth_)
        # for more detail: token.dep_, token.head
    return list(zip(the_terms, the_pos))
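# Hypothetical usage sketch for get_tags_of_sentence() above (not part of the
# original source); it assumes the spaCy English model is installed, and the
# example sentence is made up.
if __name__ == '__main__':
    for term, pos in get_tags_of_sentence(u'The quick brown fox jumps.'):
        print(term, pos)  # prints each surface token with its coarse POS tag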
def test_parse_tree(EN):
    text = 'I like New York in Autumn.'
    EN = English(parser=False)
    doc = EN(text, tag=True)
    doc.from_array([HEAD],
                   numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T)
    # full method parse_tree(text) is a trivial composition
    trees = doc.print_tree()
    assert len(trees) > 0
    tree = trees[0]
    assert all(k in list(tree.keys()) for k in
               ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc', 'modifiers'])
    assert tree['word'] == 'like'  # check root is correct
def testSVOs():
    nlp = English()
    # parser = English()
    tok = nlp(
        "Find 2 numbers whose sum is 64 and whose difference is 4. "
        "What is the smaller number? What is the larger number?"
    )
    svos = findSVOs(tok)
    printDeps(tok)
    print(svos)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-isTrain', type=int, default=1)
    args = parser.parse_args()
    nlp = English()  # used for counting the number of tokens

    if args.isTrain == 1:
        annFile = '../data/mscoco_train2014_annotations.json'
        quesFile = '../data/OpenEnded_mscoco_train2014_questions.json'
        questions_file = open('../data/preprocessed/questions_train2014.txt', 'w')
        questions_lengths_file = open(
            '../data/preprocessed/questions_lengths_train2014.txt', 'w')
        answers_file = open('../data/preprocessed/answers_train2014.txt', 'w')
        coco_image_id = open('../data/preprocessed/images_train2014.txt', 'w')
        trainval = 'training data'
    else:
        annFile = '../data/mscoco_val2014_annotations.json'
        quesFile = '../data/OpenEnded_mscoco_val2014_questions.json'
        questions_file = open('../data/preprocessed/questions_val2014.txt', 'w')
        questions_lengths_file = open(
            '../data/preprocessed/questions_lengths_val2014.txt', 'w')
        answers_file = open('../data/preprocessed/answers_val2014.txt', 'w')
        coco_image_id = open('../data/preprocessed/images_val2014.txt', 'w')
        trainval = 'validation data'

    # initialize VQA api for QA annotations
    vqa = VQA(annFile, quesFile)
    questions = vqa.questions
    ques = questions['questions']
    qa = vqa.qa

    pbar = progressbar.ProgressBar()
    print 'Dumping questions, answers, imageIDs, and question lengths to text files...'
    for i, q in pbar(zip(xrange(1, len(ques) + 1), ques)):
        questions_file.write(q['question'].encode('utf8'))
        questions_file.write('\n'.encode('utf8'))
        questions_lengths_file.write(str(len(nlp(q['question']))).encode('utf8'))
        questions_lengths_file.write('\n'.encode('utf8'))
        coco_image_id.write(str(q['image_id']).encode('utf8'))
        coco_image_id.write('\n')
        if args.isTrain:
            answers_file.write(
                getModalAnswer(qa[q['question_id']]['answers']).encode('utf8'))
        else:
            answers_file.write(
                getAllAnswer(qa[q['question_id']]['answers']).encode('utf8'))
        answers_file.write('\n'.encode('utf8'))

    print 'completed dumping', trainval
def __init__(self, name='QueryDefinitionScorer', description='', short_name='qds',
             strategy='max'):
    """
    If a question is of the form 'what is X', score the extent to which a single
    sentence in the answer is of the form 'X is ...'

    args:
        name, description, short_name (str): See QueryDocumentScorer
        strategy (str): The scoring strategy. Must be one of the following:
            'max', 'average'
    """
    super(QueryDefinitionScorer, self).__init__(name=name,
                                                description=description,
                                                short_name=short_name)
    self.strategy = strategy
    self.nlp = English()
def __init__(self, fetcher, dry_run=False):
    self.fetcher = fetcher
    self.logger = logging.getLogger(__name__)
    self.parser = English()
    # A custom stoplist
    STOPLIST = set(
        nltk_stopwords.words('english') +
        ["n't", "'s", "'m", "ca", "p", "t"] +
        list(ENGLISH_STOP_WORDS))
    # note: set(('non')) would build {'n', 'o'}; a one-element tuple is needed
    ALLOWED_STOPLIST = set(('non',))
    self.STOPLIST = STOPLIST - ALLOWED_STOPLIST
def substitute(self):
    NLP_Parser = English()
    summary = self.summary.split('<')[0]
    NLP_Tokens = NLP_Parser(summary)
    tokenized_summary = [token.orth_ for token in NLP_Tokens]
    substituted_summary = [word for word in tokenized_summary]
    for word in tokenized_summary:
        if word.lower() in config.substitutions:
            substituted_summary[substituted_summary.index(word)] = \
                config.substitutions[word]
    self.tweet_substituted = ' '.join(substituted_summary)
def main(): """ Creates a "knowledge resource" from triplets file """ # Get the arguments args = docopt( """Parse the Wikipedia dump and create a triplets file, each line is formatted as follows: X\t\Y\tpath Usage: parse_wikipedia.py <in_file> <out_file> <in_file> = the Wikipedia dump file <out_file> = the output (parsed) file """) nlp = English() in_file = args['<in_file>'] out_file = args['<out_file>'] wrote_paths = 0 with codecs.open(in_file, 'r') as f_in: with codecs.open(out_file, 'w') as f_out: with codecs.open(out_file + "_paths", 'w') as f_path_out: # Read the next paragraph for paragraph in f_in: # Skip empty lines paragraph = paragraph.replace("'''", '').strip() paragraph = paragraph.decode("ascii", errors="ignore").encode() if len(paragraph) == 0: continue parsed_par = nlp(unicode(paragraph)) # Parse each sentence separately for sent in parsed_par.sents: dependency_paths = parse_sentence(sent) if len(dependency_paths) > 0: for dependency_triple in dependency_paths: f_path_out.write('%s\n' % dependency_triple[2]) wrote_paths += 1 out = '\n'.join( ['\t'.join(path) for path in dependency_paths]) print >> f_out, out print("Wrote %s paths to file: %s" % (wrote_paths, out_file + "_paths")) print("Finished processing file: %s" % in_file)
def __init__(self, **kwargs):
    self.nlp = English()
    self.window = kwargs.pop('window')
    self.nps_model_tag = kwargs.pop('nps_model_tag')
    log.info('Preparing sequence feature extractor... (window=%s)', str(self.window))
    self.window_feature_extractors = [
        WordFeatures(),
        TagFeatures(),
    ]
    self.feature_extractors = [GazeteerFeatures(self.nps_model_tag)]
def __init__(self):
    self.pattern = """
        NP2: {<JJ.?>+ <RB>? <JJ.?>* <NN.?|FW>+ <VB.?>* <JJ.?>*}
        NP1: {<JJ.?>? <NN.?|FW>+ <CC>? <NN.?|FW>* <VB.?>? <RB.?>* <JJ.?>+ (<CC><JJ.?>)?}
    """
    self.st = None
    self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    self.nltk_tokenizer = RegexpTokenizer(r'\w+')
    self.nlp = English(parser=False, tagger=True, entity=False)
    self.exclude = set(string.punctuation)
def main():
    nlp = English()
    texts = [
        u'Net income was $9.4 million compared to the prior year of $2.7 million.',
        u'Revenue exceeded twelve billion dollars, with a loss of $1b',
    ]
    for text in texts:
        doc = nlp(text)
        relations = extract_currency_relations(doc)
        for r1, r2 in relations:
            print(r1.text, r2.ent_type_)
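# extract_currency_relations() is not defined in this snippet. Below is a minimal
# sketch (an assumption, not the original implementation) of what such a helper
# might do: pair each MONEY entity with the noun it attaches to in the dependency
# parse, returning (noun_token, money_token) tuples as the loop above expects.
def extract_currency_relations_sketch(doc):
    relations = []
    for ent in doc.ents:
        if ent.label_ == 'MONEY':
            head = ent.root.head
            if head.pos_ == 'NOUN':
                relations.append((head, ent.root))
    return relations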
def test_svo_s():
    nlp = English()
    tok = nlp(
        "For Halloween Debby and her sister "
        "combined the candy they received. "
        "Debby had 32 pieces of candy while her sister"
        " had 42. If they ate 35 pieces the first night, how many pieces do they have left?"
    )
    svos = find_svo_s(tok)
    print_dep_s(tok)  # prints dependencies
    print(svos)


"""EOF"""
def __init__(self):
    print('Loading GloVe data... ', end='', flush=True)
    self._nlp = English()  # TODO(Bernhard): try word2vec instead of glove..
    print('Done.')

    # embedding_dims of glove
    embedding_dims = 300

    self._model = Sequential()
    self._model.add(Reshape(input_shape=(embedding_dims,),
                            target_shape=(embedding_dims,)))
def __init__(self, host='localhost', port=27017, db='python_import',
             collection='earnings_call_Nas100_Broad_Manual_Update'
             ):  # earnings_transcript, earnings_call_Dow30_Broad,
    self.client = pymongo.MongoClient(host=host, port=port)
    self.db = self.client[db]
    self.collection = self.db[collection]
    self.nlp = English()
    self.initialize_logging(db, host, port)
    self.initialize_dictionaries()
def smm4h_inference_old(config, verbose=0, as_ensemble=True):
    print("Loading embedding model, this can take some time...")
    model = Wordsim.load_vector(config["embedd_model_path"])
    print("Loading spaCy model, this can take some time...")
    nlp = English()

    if config["checkpoint_file"] is not None:
        n_models = len(config["checkpoint_file"])
    else:
        n_models = len(config["run_name"])
    # Assume ensemble of models and run only first model if not

    ensemble_output_scores = []
    std = 0

    # Read lines
    with open(config["submission_out"], "w", encoding='utf-8') as out_file:
        count = 0
        with open(config["test_path"], 'r', encoding='utf-8') as in_file:
            for line in in_file:
                # Get data
                rows = line.strip().split("\t")
                text = rows[1]

                # Convert to embedding
                doc_vector = text_to_embeddings.text_to_vec(
                    model, nlp, text, config["embedding_dimension"],
                    config["document_length"])
                x_test = doc_vector.reshape(
                    1, config["embedding_dimension"] * config["document_length"])

                # Dummy label
                y_test = np.empty([1, config["n_classes"]])

                # Get prediction
                for model_id in range(n_models):
                    eval_score, std = model_inference(
                        as_ensemble, config, ensemble_output_scores, model_id,
                        std, verbose, x_test, y_test)

                mean_ensemble_scores = np.mean(
                    np.asarray(ensemble_output_scores), axis=0)
                y_pred = np.argmax(mean_ensemble_scores, 1)

                # Write
                out_file.write(line.strip() + "\t" + str(y_pred[0]) + "\n")
                count += 1

    return None
def compute_block(block_no):
    reviews = pd.read_csv(
        '../amazon/tmp/reviews_electronics_block_{}.csv'.format(block_no))
    parser = English()
    result = []
    for idx in reviews.index:
        if isinstance(reviews.ix[idx, 'reviewText'], str):
            parsed_review = parser(reviews.ix[idx, 'reviewText'])
            counts = defaultdict(int)
            for word in parsed_review:
                if word.lemma_ in posneg.keys():
                    current_word = word
                    # find the head of all heads
                    while current_word.head is not current_word:
                        current_word = current_word.head
                    n_negations = check_for_negations(current_word)
                    weight = posneg[word.lemma_]
                    if n_negations % 2:  # means the statement was negated
                        weight *= -1
                    if weight < 0:
                        counts['count_negative'] += 1
                        counts['count_negative_weighted'] -= weight
                    else:
                        counts['count_positive'] += 1
                        counts['count_positive_weighted'] += weight
                if word.lemma_ in negations:
                    counts['count_negations'] += 1
            # this is a little uglier than writing to a dataframe immediately
            # but _a lot_ faster
            result.append([
                reviews.ix[idx, 'reviewID'],
                " ".join([x.lemma_ for x in parsed_review]),
                counts['count_negations'], counts['count_negative'],
                counts['count_negative_weighted'], counts['count_positive'],
                counts['count_positive_weighted']
            ])
        else:
            result.append([
                reviews.ix[idx, 'reviewID'], None, None, None, None, None, None
            ])
    pd.DataFrame(
        result,
        columns=[
            'reviewID', 'reviewTextLemmatized', 'count_negations',
            'count_negative', 'count_negative_weighted', 'count_positive',
            'count_positive_weighted'
        ]).to_csv(
            '../amazon/tmp/reviews_electronics_block_{}_lemmatized.csv'.format(
                block_no))
def process(batch_id, inputs, output_dir, lang, n_threads, batch_size,
            min_ngram, max_ngram):
    logging.info('Processing batch_id: {}'.format(batch_id))
    subtrees = PreshCounter()
    subtrees_string_map = StringStore()
    noun_chunks = PreshCounter()
    noun_chunks_string_map = StringStore()

    if lang.lower() == "en":
        from spacy.en import English
        NLU = English()
        NLU.matcher = None
    elif lang.lower() == "id":
        from spacy.id import Indonesian
        NLU = Indonesian()
        NLU.matcher = None

    for i, doc in enumerate(
            NLU.pipe(inputs, batch_size=batch_size, n_threads=n_threads)):
        phrases = set()
        for tok in doc:
            st_len = len(list(tok.subtree))
            if min_ngram <= st_len <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws)
                              for t in tok.subtree]).strip()
                orth = subtrees_string_map[st]
                subtrees.inc(orth, 1)
        for np in doc.noun_chunks:
            if min_ngram <= len(np) <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws) for t in np]).strip()
                orth = noun_chunks_string_map[st]
                noun_chunks.inc(orth, 1)
        if i % batch_size == 0:
            logging.info('Processing batch_id: {}, doc: {}'.format(batch_id, i))

    output_fname = path.join(output_dir, 'batch{}.st.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in subtrees:
            st = subtrees_string_map[orth]
            if count >= 5 and '!LONGWORD!' not in st:
                out.write('{}\t{}\n'.format(count, st))

    output_fname = path.join(output_dir, 'batch{}.np.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in noun_chunks:
            if count >= 5:
                st = noun_chunks_string_map[orth]
                out.write('{}\t{}\n'.format(count, st))
def main(): parser = English() fn = "../train-v1.1.json" with open(fn) as fp: data = json.load(fp) sent_counts = Counter() for topic in data['data']: for pgraph in topic['paragraphs']: context = pgraph['context'] num_sents = len(list(parser(context).sents)) sent_counts[num_sents] += 1 print sent_counts
def Spider(self, *argv):
    jobs = []
    nlp = English()
    self.data = list()
    for joburl in self.spider(self, *argv):
        self.data.append(dict())
        r = requests.get(joburl, cookies=self.cookies)
        soup = BeautifulSoup(r.text, "lxml")
        content = soup.body.find(
            'div', attrs={'id': 'ctl00_MainContent_PrimaryContent'})
        jobs.append(dict())
        for span in content.span.find_all('span', recursive=True):
            if span.has_attr('aria-labelledby'):
                # print span['id'] + " : " + span.text
                self.data[-1][span['id']] = span.getText(separator=u' ')
def main():
    parser = English()
    fn = os.path.join(DATA_DIR, TRAIN_FILE)
    with open(fn) as fp:
        data = json.load(fp)
    result = []
    # data["data"][0]["paragraphs"][0]["qas"][0]["answers"][0]["text"]
    bad_count = 0
    for topic in data['data']:
        for pgraph in topic['paragraphs']:
            token_dict = {}
            context = pgraph['context']
            c_parsed = parser(context)
            sents = list(c_parsed.sents)
            for sent_num, sent in enumerate(sents):
                for token in sent:
                    token_dict[token.idx] = (token, sent_num, token.i - sent.start)
            for qa in pgraph['qas']:
                question = qa['question']
                q_parsed = parser(question)
                q_tokens = [token.orth_ for token in q_parsed]
                # e_question = str(question).encode('utf-8')
                for ans in qa['answers']:
                    answer_start = ans['answer_start']
                    answer_text = ans['text']
                    a_parsed = parser(answer_text)
                    a_tokens = [token.orth_ for token in a_parsed]
                    if answer_start in token_dict:
                        token, sent_num, answer_idx = token_dict[answer_start]
                        sent_tokens = [token.orth_ for token in sents[sent_num]]
                        end_idx = answer_idx + len(a_tokens) - 1
                        if (end_idx >= len(sent_tokens)
                                or sent_tokens[end_idx] != a_tokens[-1]
                                or len(a_tokens) != 1):
                            # print str((c_tokens[end_idx], a_tokens[-1])).encode('utf-8')
                            # print str(c_tokens[answer_idx:answer_idx + 5]).encode('utf-8')
                            # print str(a_tokens).encode('utf-8')
                            # print
                            bad_count += 1
                            continue
                        # line = [e_context, e_question, str(answer_idx)]
                        if answer_idx <= MAX_CLENGTH:
                            line = [sent_tokens, q_tokens, str(answer_idx), str(end_idx)]
                            result.append(line)
    out_fn = os.path.join(DATA_DIR, "sent-train-preproc.json")
    with open(out_fn, 'w') as train_fp:
        train_fp.write(json.dumps(result))
    print bad_count
def main():
    nlp = English()
    texts = [
        u'Donald Trump issued the ban orders recently.',
        u'It prohibited people from 7 countries entering the US for 90 days.',
        u'Net income was $9.4 million compared to the prior year of $2.7 million.',
        u'Revenue exceeded twelve billion dollars, with a loss of $1b.',
    ]
    for text in texts:
        doc = nlp(text)
        relations = extract_currency_relations(doc)
        for r1, r2 in relations:
            print(r1.text, r2.ent_type_, r2.text)
def test_list_orphans():
    # Test case from NSchrading
    nlp = English(load_vectors=False)
    samples = ["a", "test blah wat okay"]
    lst = []
    for sample in samples:
        # Go through all the samples, call nlp() on each to get tokens,
        # pass those tokens to the _orphan_from_list() function, get a list back
        # and put all results in another list
        lst.extend(_orphan_from_list(nlp(sample)))
    # go through the list of all tokens and try to print orth_
    orths = ['a', 'test', 'blah', 'wat', 'okay']
    for i, l in enumerate(lst):
        assert l.orth_ == orths[i]
def main(train_loc, dev_loc, model_dir):
    with codecs.open(train_loc, 'r', 'utf8') as file_:
        train_sents = read_conll(file_)
    train(English, train_sents, model_dir)
    nlp = English(data_dir=model_dir)
    dev_sents = read_conll(open(dev_loc))
    scorer = Scorer()
    for _, sents in dev_sents:
        for annot_tuples, _ in sents:
            score_model(scorer, nlp, None, annot_tuples)
    print('TOK', 100 - scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)
def __init__(self, name='ProperNounRatioScorer', short_name='pnrs',
             description='Proper Noun Ratio Scorer', nlp=None):
    """
    Class that computes the ratio of proper nouns in a query.
    The idea is that a query with a large fraction of proper nouns
    will tend to be a keyword query.

    Args:
        name, short_name, description (str): See query_scorer.QueryScorer
        nlp (spacy.en.English): Tokenizes incoming text
    """
    super(ProperNounRatioScorer, self).__init__(name=name,
                                                short_name=short_name,
                                                description=description)
    if nlp:
        self.nlp_ = nlp
    else:
        self.nlp_ = English()
def main(giga_db_loc, n_docs, pos_tag=False, parse=False):
    docs = Gigaword(giga_db_loc, limit=n_docs)
    nlp = English()
    out_dir = '/tmp/spacy_out'
    if path.exists(out_dir):
        shutil.rmtree(out_dir)
    for i, doc in enumerate(docs):
        tokens = nlp(doc, tag=pos_tag, parse=parse)
        with codecs.open(path.join(out_dir, '%d.txt' % i), 'w', 'utf8') as file_:
            for sent in tokens.sents:
                for word in sent:
                    # the original format string had five placeholders for four
                    # values; write orth, tag, head index, and dep label per token
                    file_.write('%s\t%s\t%d\t%s\n' %
                                (word.orth_, word.tag_, word.head.i, word.dep_))
def tokenize(text, lang):
    if lang == 'en':
        parser = English()
    elif lang == 'fr':
        parser = French()
    lda_tokens = []
    tokens = parser(text)  # will split apostrophe
    for token in tokens:
        if token.orth_.isspace():
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens
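# Hypothetical usage sketch for tokenize() above (not part of the original source);
# it assumes both the English and French spaCy models referenced by the function
# are installed, and the sample sentences are made up.
if __name__ == '__main__':
    print(tokenize(u"L'équipe n'a pas perdu.", 'fr'))  # apostrophes split into tokens
    print(tokenize(u"The team didn't lose.", 'en'))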