def search_date(self, sen_id, sen, sen_lemma, sen_ner, keyword=None, dirc=None):
    """Collect date mentions of a sentence, optionally filtered around a keyword.

    Candidates are whatever ``search_tag`` yields for the ``'DATE'`` tag, plus
    the literal ``'morning'`` (or, failing that, ``'afternoon'``) when that
    substring occurs in the space-joined lemmas.

    When ``keyword`` is given, candidates are filtered by their character
    position in the space-joined lemmas relative to the keyword's position.

    Args:
        sen_id: Sentence identifier (not used by this method).
        sen: Tokens of the sentence.
        sen_lemma: Lemmas of the sentence.
        sen_ner: NER tags handed to ``search_tag`` together with ``sen``.
        keyword: Optional anchor substring looked up in the joined lemmas.
        dirc: Filter direction, required when ``keyword`` is given.
            ``dirc > 0`` keeps candidates located at or after the keyword;
            any other non-None value keeps candidates located at or before
            it.  With ``dirc=None`` the problem is logged and the
            candidates are returned unfiltered.

    Returns:
        list: The (possibly filtered) date candidates.

    Raises:
        ValueError: If there are candidates to filter and ``keyword`` or one
            of the candidates does not occur in the space-joined lemmas.
    """
    lemma_text = ' '.join(sen_lemma)
    date_candidates = list(search_tag(sen, 'DATE', sen_ner))
    if 'morning' in lemma_text:
        date_candidates.append('morning')
    elif 'afternoon' in lemma_text:
        date_candidates.append('afternoon')

    if keyword is None:
        return date_candidates
    if dirc is None:
        self.logger.debug(
            'Error:\t |ipo_detect\tsearch_date|\tNeed search dirction for search keyword {}\n'
            .format(keyword))
        return date_candidates
    if not date_candidates:
        return date_candidates

    # Build a new list instead of calling remove() on the list being iterated:
    # removing during iteration shifts the remaining items and makes the loop
    # skip the element that follows every removed one.
    keyword_pos = lemma_text.index(keyword)
    if dirc > 0:
        return [date for date in date_candidates
                if lemma_text.index(date) >= keyword_pos]
    return [date for date in date_candidates
            if lemma_text.index(date) <= keyword_pos]
def search_trade_info(self, sen_id, sen, sen_lemma, sen_ner, sen_entity):
    """Score a sentence for evidence that the topic company started trading.

    The lemmas are joined with the module separator ``sp`` and matched
    against two patterns (the patterns hard-code ``~^~`` as the separator,
    so ``sp`` is presumably ``'~^~'`` -- confirm against its definition).

    Args:
        sen_id: Sentence identifier, only used in the printed message.
        sen: Tokens of the sentence.
        sen_lemma: Lemmas of the sentence.
        sen_ner: NER tags handed to ``search_tag`` together with ``sen``.
        sen_entity: Per-token linked entities; joined with spaces and
            searched for ``self.topic_company`` as a fallback.

    Returns:
        tuple: ``(score, tag)``.
            * ``(1, 'Upcoming')`` when the sentence only expresses an
              intention to trade (will/could/would/to ... trade).
            * ``(1, 'Trade')`` when an "open/begin/start ... trade ... on/at"
              sentence is attributed to the topic company.
            * ``(0, None)`` otherwise.
    """
    comp_ipo_score, ipo_tag = 0, None
    lemma_text = sp.join(sen_lemma)
    if 'trad' not in lemma_text:
        return comp_ipo_score, ipo_tag

    # Eliminate sentences that only state an intention to trade.
    # Raw strings keep the regex escapes (\^, \w) intact without relying on
    # Python passing unknown string escapes through unchanged.
    if re.match(r"^.*(will|could|would|to)~\^~(\w{,7}~\^~)*trade",
                lemma_text) is not None:
        return 1, 'Upcoming'

    begin_trade = re.match(
        r'.*(open|begin|start)~\^~(.*~\^~){,1}(trade|trading)~\^~(.*~\^~){,1}(on|at).*',
        lemma_text)
    if begin_trade is None:
        return comp_ipo_score, ipo_tag

    hit_message = ('Find IPO keyword | {} | in sentence{} for target '
                   'company {}:\tget 1 score for IPO')
    comp_candidate = [
        comp for comp in search_tag(sen, 'ORGANIZATION', sen_ner)
        if comp not in financial_terms
    ]
    if comp_candidate:
        # Credit the sentence once, for the first organization that matches
        # the topic company.
        for comp in comp_candidate:
            if match_substring(comp, self.topic_company):
                comp_ipo_score += 1
                ipo_tag = 'Trade'
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print(hit_message.format('Begin Trade', sen_id,
                                         self.topic_company))
                break
    elif self.topic_company in ' '.join(sen_entity):
        # No organization mention at all: fall back to the linked entities.
        comp_ipo_score += 1
        ipo_tag = 'Trade'
        print('Entity linked:')
        print(hit_message.format('Begin Trade', sen_id, self.topic_company))
    return comp_ipo_score, ipo_tag
def match_comp_ticker(self, sen_id, sen, sen_lemma, sen_ner, sen_entity):
    """Yield a ticker record and running IPO score per stock code in the sentence.

    For every entry of ``IPO_Search.stock_codes`` that occurs in the
    space-joined sentence, ``self.search_ticker`` is asked for a ticker
    record.  Records that start with the separator ``sp`` (no company part)
    get a company prepended: the first linked entity, else the topic company
    when it occurs in the sentence, else the literal ``'None'``.

    Args:
        sen_id: Sentence identifier (forwarded and used in messages).
        sen: Tokens of the sentence.
        sen_lemma: Lemmas, forwarded to ``self.search_intention``.
        sen_ner: NER tags, forwarded to ``self.search_ticker``.
        sen_entity: Per-token linked entities handed to ``search_tag``.

    Yields:
        tuple: ``(ticker_record, score)``.  The record gets a trailing ``'*'``
        when its company part does not match the topic company.  ``score`` is
        cumulative over the stock codes processed so far and grows by one
        each time a matching company comes with an ``'IPO'`` intention.
    """
    sentence_text = ' '.join(sen)
    ipo_hits = 0
    for code in IPO_Search.stock_codes:
        if code not in sentence_text:
            continue

        ticker = self.search_ticker(sen_id, sen, sen_ner, code)
        if ticker == '':
            continue

        if ticker.startswith(sp):
            # The record lacks its company part; pick the best available one.
            linked = list(search_tag(sen, 'O', sen_entity))
            if linked:
                ticker = linked[0] + ticker
            elif self.topic_company in sentence_text:
                ticker = self.topic_company + ticker
            else:
                self.logger.debug(
                    'Error:\t|ipo_detect\tmatch_comp_ticker|\tNo match company for {}. \t|{}\n'
                    .format(ticker, sen_id))
                ticker = 'None' + ticker

        owner = ticker.split(sp)[0]
        if not match_substring(owner, self.topic_company):
            ticker += '*'
        else:
            intention, _ = self.search_intention(sen_id, sen, sen_lemma)
            if intention == 'IPO':
                print('Find ticker | {} | in sentence{} for target company {}:\tget 1 score for IPO.'.format(
                    ticker, sen_id, self.topic_company))
                ipo_hits += 1
        yield (ticker, ipo_hits)
def entity_coref_rsl(self):
    """Link pronouns / definite descriptions to organization referents.

    For every sentence ``j`` the method resets ``self.entity_coref[j]`` to a
    list of ``'O'`` (one per token) and then, for each occurrence of a
    pronoun or definite description in ``pds``, passes a referent string to
    ``update_coref_entity`` (presumably this writes the referent into the
    token slots of the occurrence -- confirm against that helper).

    Referent strings built here have the form ``'<sentence>@<start>@<end>'``,
    or ``self.topic_company + '*'`` when the topic company is used as the
    fallback.

    Resolution order for one occurrence:
      1. ``'it'`` in an "it is <adj/noun> to/for/that/..." construction is
         skipped.
      2. A ``':'`` or ``'--'`` shortly after the occurrence that is followed
         by an ORGANIZATION token links to that organization.
      3. An organization (or already linked entity) inside the span returned
         by ``self.search_dominated_np_for_pd`` is used; the last one wins.
      4. Otherwise the entity remembered from the previous sentence
         (``last_entity_index``) is used.

    Returns:
        ``self.entity_coref`` after all sentences were processed.
    """
    # predefined pronouns and definite articles; the list is extended below
    # and the additions persist for all following sentences
    pds = ['it', 'its', 'the company', 'the business', 'the firm']
    # last_entity/last_entity_index are used to record the org entity in the subject part of prior sentence
    last_entity, last_entity_index = None, None
    # Iterate each sentence
    for j in xrange(0, len(self.words)):
        # every token starts out unlinked
        self.entity_coref[j] = ['O'] * len(self.words[j])
        # Load NLP info for each sentence
        sen_id, sen, sen_lemma = self.sen_ids[j], self.words[
            j], self.lemmas[j]
        sen_ner, sen_pos, sen_tree_str = self.ners[j], self.pos_tags[
            j], self.parse_trees[j]
        sen_depend = sen_depen(self.dependencies[j], len(sen), reverse=True)
        # self.logger.info('Sentence id: {} \n'.format(sen_id))
        # find all candidates companies other than special entities like journal, stock exchange institutions
        comps = [(comp_index, comp) for (
            comp_index, comp) in search_tag(sen, 'ORGANIZATION', sen_ner, index=True) if comp not in CR.special_terms]
        # using dependency feature to find any definite articles
        # like "the XXX company" and add them to the pds list
        for index in search_word('the', sen):
            # only the first entry stored for the token is consulted
            (dep_index, tag) = sen_depend[index][0]
            if sen[dep_index] in ['company', 'business', 'firm']:
                new_pd = ' '.join(sen[index:dep_index + 1])
                if new_pd not in pds:
                    pds.append(new_pd)
        for pd in pds:
            pd_indices = [index for index in search_word(pd, sen)]
            for pd_index in pd_indices:
                if pd == 'it':
                    # Skip sentence such as 'it's + adj, it's + noun.'
                    (dep_index, tag) = sen_depend[pd_index][0]
                    # NOTE(review): sen[dep_index + 1] is not bounds-checked;
                    # confirm dep_index can never be the last token.
                    if sen[dep_index + 1] in ["to", "for", "that", "about", "because"]:
                        if sen_lemma[dep_index] == 'be' or \
                                ('be' in sen_lemma[pd_index:dep_index] and (sen_pos[dep_index] == 'JJ' or sen_pos[dep_index].startswith('NN')) ):
                            # self.logger.info('"it" is not recognized as pronoun, because of it + adj or noun.\n')
                            continue
                # The SBAR check below currently has no effect (its body is
                # only `pass`); tree_position is still called.
                if ',' in sen[pd_index:]:
                    sen_tree, sen_tree_index, sen_structure = tree_position(
                        sen_tree_str)
                    if 'SBAR' in sen_structure[pd_index]:
                        # self.logger.debug("Should find entity after pronoun/definite article {}.Sen_id:\t{}\n".
                        #                   format(pd, sen_id))
                        pass
                # Resolve the situation when referent is supposed to locate after pronoun
                # that is when there is ':' or '--' followed by organization entity right after the pronoun
                flag = False
                dash_count = 0
                if '--' in sen[pd_index:pd_index + 3] or ':' in sen[pd_index:pd_index + 3]:
                    i = pd_index + 1
                    while i < len(sen) - 1:
                        # only an opening '--' (even dash_count) qualifies
                        if (sen[i] == ':' or (sen[i] == '--' and dash_count%2 == 0)) \
                                and sen_ner[i+1] == 'ORGANIZATION':
                            start = i + 1
                            # NOTE(review): this scan starts at i (the separator
                            # token), not at `start`; unless the separator is
                            # itself tagged ORGANIZATION the end index stays
                            # at i, i.e. before `start` -- confirm intent.
                            while i < len(
                                    sen) and sen_ner[i] == 'ORGANIZATION':
                                i += 1
                            referent = '{}@{}@{}'.format(j, start, i)
                            update_coref_entity(
                                pd_index, pd_index + len(pd.split(' ')), self.entity_coref[j], referent)
                            flag = True
                        if sen[i] == '--':
                            dash_count += 1
                        i += 1
                # if any situation above is triggered, skip this pronoun for prior organization detection
                # NOTE(review): the branch only contains `pass`, so the
                # detection below still runs when flag is set -- confirm.
                if flag:
                    # self.logger.info('Referent is assumed to locate after pronoun, because of ":" or "--".\n')
                    pass
                # Code below is used to prior organization entity detection and linked with pronoun
                (np_start, np_end, verb_index) = self.search_dominated_np_for_pd(
                    sen_tree_str, pd_index)
                if np_start is None:
                    pass
                    # self.logger.debug('Cannot Find directed dominated NP for pronoun {}, and assign subject entity \
                    # --{}-- of the prior sentence to this pronoun.\n'.format(pd, last_entity))
                else:
                    # Find any org entity located in the range (np_start, np_end)
                    # in sentence or its co-reference array
                    sub_entities = [
                        (comp_index,
                         comp) for (comp_index, comp) in comps if comp_index <= np_end and np_start <= comp_index
                    ]
                    # NOTE(review): this call unpacks 3-tuples from
                    # search_tag(..., index=True) while the `comps` call
                    # above unpacks 2-tuples -- verify against search_tag.
                    sub_entities += [
                        (comp_index, comp) for (comp_index, comp_end, comp) in search_tag(self.entity_coref[j], 'O', self.entity_coref[j], index=True) if comp_index <= np_end and np_start <= comp_index
                    ]
                    # If any, get the last one, which is assumed to be nearest one to the pronoun
                    if len(sub_entities) > 0:
                        (referent_index, refer_entity) = sub_entities[-1]
                        # if '@' in refer_entity means the refer_entity is from the co-reference array,
                        # just assign its value to the referent
                        if '@' in refer_entity:
                            referent = refer_entity
                        else:
                            # entity_index format is sen_id@start_index@end_index
                            referent = '{}@{}@{}'.format(
                                j, referent_index, referent_index + len(refer_entity.split(' ')))
                        # update referent in the co-reference array
                        update_coref_entity(pd_index, pd_index + len(pd.split(' ')), self.entity_coref[j], referent)
                        continue
                    else:
                        pass
                        # self.logger.debug('Cannot find entity for pronoun/definite article {} in dominated NP part,\
                        # and assign subject entity --{}-- of the prior sentence to this pronoun.\n'.format(pd, last_entity))
                # if last entity is None, pick a fallback before linking
                if last_entity is None:
                    # if the sentence is the first sentence, use topic company instead
                    if j == 0:
                        last_entity = self.topic_company
                        last_entity_index = self.topic_company + '*'
                        # self.logger.info('Last entity is None and update it with topic company (1st sentence)!\n')
                    else:
                        # first non-special organization of the previous sentence
                        pre_comps = [
                            (comp_index, comp) for (comp_index, comp) in search_tag(self.words[j - 1], 'ORGANIZATION', self.ners[j - 1], index=True) if comp not in CR.special_terms
                        ]
                        if len(pre_comps) > 0:
                            comp_index, last_entity = pre_comps[0]
                            last_entity_index = '{}@{}@{}'.format(
                                j - 1, comp_index, comp_index + len(last_entity.split(' ')))
                            # self.logger.info('Last entity is None and update last entity with organization \
                            # appeared in prior sentence: {}\n'.format(last_entity))
                        else:
                            # self.logger.info('Last entity is None and update it with topic company!\n')
                            last_entity = self.topic_company
                            last_entity_index = self.topic_company + '*'
                # update referent in the co-reference array
                update_coref_entity(pd_index, pd_index + len(pd.split(' ')), self.entity_coref[j], last_entity_index)
        # Identify new organization entity in subject of the sentence and
        # update it to be the last entity
        (new_entity_index, new_entity) = self.search_subject_entity(sen_id, sen, sen_tree_str, sen_ner, comps)
        if new_entity:
            last_entity = new_entity
            # NOTE(review): the sentence part comes from the second
            # '@'-separated field of sen_id here, whereas the referents built
            # above use j -- confirm the two numbering schemes agree.
            last_entity_index = '{}@{}@{}'.format(
                sen_id.split('@')[1], new_entity_index, new_entity_index + len(new_entity.split(' ')))
            # self.logger.info('UPDATE LAST ENTITY to be {}\n'.format(sen_id, new_entity))
    return self.entity_coref
def search_raise_fund(self, sen_id, sen, sen_lemma, sen_ner, sen_entity, sen_ipo_tags=None):
    """Yield "company / amount / confidence" records for funds raised or shares offered.

    Two mutually exclusive modes, chosen by the NER tags of the sentence:

    * ``'MONEY'`` present: every money mention ending in ``'illion'`` is
      attributed to a company via ``self.search_comp_with_flag``.  The
      confidence starts at 0.2, gains 0.15 when ``sen_ipo_tags`` is None or
      0.3 when ``sen_id`` is in it, and gains 0.2 for a ``file`` lemma located
      after the money index or else 0.1 for a ``raise`` lemma.
    * otherwise, ``'shares'`` token and ``'NUMBER'`` tag present: every
      number at most four tokens before the first ``'shares'`` is reported
      with confidence 0.7.

    Args:
        sen_id: Sentence identifier.
        sen: Tokens of the sentence.
        sen_lemma: Lemmas of the sentence.
        sen_ner: NER tags of the sentence.
        sen_entity: Per-token linked entities.
        sen_ipo_tags: Optional container of sentence ids tagged as IPO.

    Yields:
        str: ``sp.join([company, amount, str(confidence)])`` where amount is
        the money text or ``'<number> shares'``.  Mentions without a company
        are logged at debug level instead of being yielded.
    """
    organizations = [
        org for org in search_tag(sen, 'ORGANIZATION', sen_ner)
        if org not in financial_terms
    ]
    linked_entities = list(search_tag(sen, 'O', sen_entity))

    if 'MONEY' in sen_ner:
        lemma_text = ' '.join(sen_lemma)
        for money_pos, money in search_tag(sen, 'MONEY', sen_ner, index=True):
            if not money.endswith('illion'):
                continue

            score = 0.2
            owner = self.search_comp_with_flag(
                sen, sen_lemma, sen_entity, organizations, linked_entities,
                money, stem=False)

            if sen_ipo_tags is None:
                score += 0.15
            elif sen_id in sen_ipo_tags:
                score += 0.3

            if 'file' in sen_lemma and sen_lemma.index('file') > money_pos:
                score += 0.2
                owner = self.search_comp_with_flag(
                    sen, sen_lemma, sen_entity, organizations,
                    linked_entities, 'file')
            elif 'raise' in sen_lemma:
                score += 0.1
                if lemma_text.index('raise') < lemma_text.index(money):
                    owner = self.search_comp_with_flag(
                        sen, sen_lemma, sen_entity, organizations,
                        linked_entities, 'raise')
                elif 'raise by' in lemma_text:
                    owner = self.search_comp_with_flag(
                        sen, sen_lemma, sen_entity, organizations,
                        linked_entities, 'raise', dirc=1)

            if owner is None:
                self.logger.debug(
                    'Error:\t |ipo_detect\tsearch_raise_fund|\tCannot find company to match price {}.\t|{}\n'
                    .format(money, sen_id))
            else:
                yield sp.join([owner, money, str(score)])
    # Count stock share
    elif 'shares' in sen and 'NUMBER' in sen_ner:
        shares_pos = sen.index('shares')
        for number_pos, number in search_tag(sen, 'NUMBER', sen_ner, index=True):
            gap = shares_pos - number_pos
            if not (number_pos < shares_pos and gap <= 4):
                continue
            owner = self.search_comp_with_flag(
                sen, sen_lemma, sen_entity, organizations, linked_entities,
                number)
            score = 0.7
            if owner is None:
                self.logger.debug(
                    'Error:\t |ipo_detect\tsearch_raise_fund|\tCannot find company to match stock share {}.\t|{}\n'
                    .format(number + ' shares', sen_id))
            else:
                yield sp.join([owner, number + ' shares', str(score)])
def search_stock_price(self, sen_id, sen, sen_lemma, sen_ner, sen_entity):
    """Yield "company / price" records for per-share prices in a sentence.

    Money mentions ending in ``'illion'`` are ignored, and so is the second
    amount of a "between $ A and $ B" range.  For each remaining mention the
    first applicable rule decides how the owning company is searched:

    1. the mention opens a "between $ A and $ B" range -> the whole range
       text is the price;
    2. the lemmas match "price|sell ... at" -> used only when the flag word
       precedes the money text in the space-joined lemmas;
    3. ``'per share'`` follows the money text in the sentence;
    4. both ``'stock'`` and ``'price'`` are tokens of the sentence.

    Args:
        sen_id: Sentence identifier (used in log messages).
        sen: Tokens of the sentence.
        sen_lemma: Lemmas of the sentence.
        sen_ner: NER tags of the sentence.
        sen_entity: Per-token linked entities.

    Yields:
        str: ``sp.join([company, price])``.  Prices without a company are
        logged at debug level instead of being yielded.
    """
    organizations = [
        org for org in search_tag(sen, 'ORGANIZATION', sen_ner)
        if org not in financial_terms
    ]
    linked_entities = list(search_tag(sen, 'O', sen_entity))
    if 'MONEY' not in sen_ner:
        return

    range_re = re.compile(r'^between \$ [0-9]+ and \$ [0-9]+$')
    price_at_re = re.compile(r'.*(price|sell)~\^~([a-zA-Z0-9]*~\^~){,5}at.*')
    sentence_text = ' '.join(sen)
    lemma_text = ' '.join(sen_lemma)
    lemma_sp_text = sp.join(sen_lemma)
    # The first two rules log with a trailing newline, the last two without.
    miss_plain = 'Error:\t |ipo_detect\tsearch_stock_price|\tCannot find company to match price {}.\t|{}'
    miss_newline = miss_plain + '\n'

    for money_pos, money in search_tag(sen, 'MONEY', sen_ner, index=True):
        if money.endswith('illion'):
            continue
        # Second amount of a range: already covered by the first amount.
        if range_re.match(' '.join(sen[money_pos - 4:money_pos + 2])):
            continue

        window = ' '.join(sen[money_pos - 1:money_pos + 5])
        if range_re.match(window):
            price, miss_message = window, miss_newline
            owner = self.search_comp_with_flag(
                sen, sen_lemma, sen_entity, organizations, linked_entities,
                window, stem=False)
        elif price_at_re.match(lemma_sp_text):
            anchor = 'price' if 'price' in lemma_text else 'sell'
            if lemma_text.index(anchor) >= lemma_text.index(money):
                continue
            price, miss_message = money, miss_newline
            owner = self.search_comp_with_flag(
                sen, sen_lemma, sen_entity, organizations, linked_entities,
                anchor, stem=False)
        elif 'per share' in sentence_text and (
                sentence_text.index(money) < sentence_text.index('per share')):
            price, miss_message = money, miss_plain
            owner = self.search_comp_with_flag(
                sen, sen_lemma, sen_entity, organizations, linked_entities,
                'per', stem=False)
        elif 'stock' in sen and 'price' in sen:
            price, miss_message = money, miss_plain
            owner = self.search_comp_with_flag(
                sen, sen_lemma, sen_entity, organizations, linked_entities,
                sen[money_pos])
        else:
            continue

        if owner is None:
            self.logger.debug(miss_message.format(price, sen_id))
        else:
            yield sp.join([owner, price])
def search_ticker(self, sen_id, sen, sen_ner, stock_code):
    """Find the ticker symbol that goes with ``stock_code`` in a sentence.

    Two strategies are tried in order:

    1. Bracket form ``-LRB- <stock_code> : <symbol> -RRB-``: the symbol is
       the token before ``-RRB-``.  The company is the ORGANIZATION candidate
       preceding ``stock_code`` (confidence 0.9) or, when ``stock_code`` is
       the first candidate, the token before ``-LRB-`` (confidence 0.75).
    2. Keyword form: among the all-uppercase tokens (not in
       ``financial_terms``) after the first ``'ticker'``/``'symbol'`` token,
       the one closest in characters to the keyword is the symbol.  Without
       a keyword (or with the keyword as first token) the scan covers tokens
       1.. and distance is measured from ``stock_code``.

    Args:
        sen_id: Sentence identifier (used in log messages).
        sen: Tokens of the sentence.
        sen_ner: NER tags handed to ``search_tag`` together with ``sen``.
        stock_code: String expected both in the sentence text and among the
            ORGANIZATION candidates (presumably an exchange name -- confirm
            against ``IPO_Search.stock_codes``).

    Returns:
        str: ``'<company>~^~<symbol>~^~<confidence>'``;
        ``'~^~<symbol>~^~0.5'`` when no company could be attached; ``''``
        when no symbol was found or ``stock_code`` is not an ORGANIZATION
        candidate.

    Raises:
        ValueError: If the bracket form is present but the sentence has no
            ``-RRB-`` token, or ``stock_code`` is needed as distance anchor
            but does not occur in the sentence text.
    """
    sentence_text = ' '.join(sen)
    com_candidates = list(search_tag(sen, 'ORGANIZATION', sen_ner))
    symbol_code = ''

    if '-LRB- {} :'.format(stock_code) in sentence_text:
        # NOTE(review): this takes the first -RRB- of the sentence, which is
        # not necessarily the one closing this bracket -- confirm.
        symbol_code = sen[sen.index('-RRB-') - 1]
        # Only a failed lookup is expected here, so catch ValueError instead
        # of using a bare except that would also hide unrelated errors.
        try:
            search_index = com_candidates.index(stock_code)
            if search_index > 0:
                return '{}~^~{}~^~0.9'.format(
                    com_candidates[search_index - 1], symbol_code)
            self.logger.debug(
                'Error\t: NER should have company before -LRB- {} : -RRB-.\t|{}\n'
                .format(stock_code, sen_id))
            return '{}~^~{}~^~0.75'.format(sen[sen.index('-LRB-') - 1],
                                           symbol_code)
        except ValueError:
            self.logger.debug(
                "Error: \t {} is not in the company list.\t|{}\n".format(
                    stock_code, sen_id))

    # Character offset of every token inside sentence_text.  Looking a token
    # up with str.index() would return its first occurrence as a substring,
    # which is wrong for repeated tokens and for short tokens contained in
    # longer ones.
    token_offsets = []
    cursor = 0
    for token in sen:
        token_offsets.append(cursor)
        cursor += len(token) + 1

    start = 0
    index_flag = 0
    for keyword in ('ticker', 'symbol'):
        if keyword in sen:
            start = sen.index(keyword)
            index_flag = token_offsets[start]
            break
    if start == 0:
        # No usable keyword: measure distances from the stock code instead.
        index_flag = sentence_text.index(stock_code)

    upper_token = re.compile(r'^[A-Z]+$')
    found = False
    symbol_code_distance = len(sentence_text)
    for ii in range(start + 1, len(sen)):
        token = sen[ii]
        if upper_token.match(token) is None or token in financial_terms:
            continue
        found = True
        distance = abs(token_offsets[ii] - index_flag)
        if distance < symbol_code_distance:
            symbol_code = token
            symbol_code_distance = distance

    if found:
        try:
            search_index = com_candidates.index(stock_code)
            if search_index > 0:
                previous = com_candidates[search_index - 1]
                if previous in financial_terms:
                    return '~^~{}~^~0.5'.format(symbol_code)
                return '{}~^~{}~^~0.8'.format(previous, symbol_code)
            return '~^~{}~^~0.5'.format(symbol_code)
        except ValueError:
            self.logger.debug(
                "Error:\t |ipo_detect\tsearch_ticker|\t{} is not in the company list.\t|{}\n"
                .format(stock_code, sen_id))
    return ''