class BM25(Feature):

    def __init__(self, index_name):
        '''
        Default constructor.

        Args:
            index_name(str): The elasticsearch index name that will be
                used to retrieve documents and idfs.
        '''
        self.es_int = ESInterface(index_name=index_name)
        print self.es_int.get_avg_size('sentence')
        self.avg_doc_length = -1

    def extract(self, query, document, stem=True, no_stopwords=True,
                b=0.75, k1=1.25):
        '''
        Args:
            query(str)
            document(str)
            stem(bool)
            no_stopwords(bool)
            b(float): Controls to what degree document length
                normalizes tf values.
            k1(float): Controls non-linear term frequency normalization.
        '''
        # terms are deduplicated, so _freq() below yields 0 or 1
        q_terms = list(set(self.tokenize(
            query, stem=stem, no_stopwords=no_stopwords)))
        d_terms = list(set(self.tokenize(
            document, stem=stem, no_stopwords=no_stopwords)))
        d_len = len(self.tokenize(document, stem=False, no_stopwords=False))
        if self.avg_doc_length == -1:
            self.avg_doc_length = self.es_int.get_avg_size('sentence')
        score = 0
        for t in q_terms:
            score += self.es_int.get_idf(t) * \
                ((self._freq(t, d_terms) * (k1 + 1)) /
                 (self._freq(t, d_terms) + k1 *
                  (1 - b + b * (float(d_len) / self.avg_doc_length))))
        return score

    def _freq(self, term, doc):
        '''
        Gets the frequency of a term in a doc.

        Args:
            term(str)
            doc(list(str)): list of strings
        '''
        return len([1 for t in doc if t == term])
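# Usage sketch, not part of the original code (assumptions: an index named
# 'biosum' exists and the Feature base class provides the tokenize() helper
# used by extract()). Because both term lists are deduplicated with set(),
# the classic BM25 form
#   idf(t) * tf * (k1 + 1) / (tf + k1 * (1 - b + b * d_len / avg_doc_length))
# is computed here with tf capped at 1.
#
#   bm25 = BM25('biosum')
#   print bm25.extract('cell migration assay',
#                      'We measured cell migration with a standard assay.')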
# Variant of BM25.__init__ with a default index taken from constants:
def __init__(self, index_name=constants.bm25_index):
    '''
    Default constructor.

    Args:
        index_name(str): The elasticsearch index name that will be
            used to retrieve documents and idfs.
    '''
    self.es_int = ESInterface(index_name=index_name)
    print self.es_int.get_avg_size('sentence')
    self.avg_doc_length = -1
def dump_stats(dump_stats_data, dump_path, index_name):
    es_int = ESInterface(index_name=index_name)
    csv_line = []
    for ann, res in dump_stats_data:
        csv_line.extend([[ann['topic_id'],
                          ann['citing_article'][:-4].lower(),
                          ann['reference_article'][:-4].lower(),
                          ann['discourse_facet']],
                         [''],
                         [ann['citation_text'].encode('ascii', 'ignore')],
                         ['']])
        offsets = chain(*[[s[0], s[1], ''] for s in
                          sorted(ann['reference_offset'].keys(),
                                 key=lambda t: t[0])])
        csv_line.extend([list(offsets), ['']])
        csv_line.append(['prec:'])
        csv_line.extend([list(t) for t in calculate_ap([res], [ann]).items()])
        csv_line.append(['ndcg:'])
        csv_line.extend([list(t)
                         for t in calculate_ndcg([res], [ann]).items()])
        csv_line.append([''])
        for i, r in enumerate(res, start=1):
            # temp until Arman fixes bug
            rel = str(calculate_ndcg([[r]], [ann])['all'] > 0).upper()
            txt = es_int.get_page_by_res(r)['sentence'].encode(
                'ascii', 'ignore')
            offset = str(
                es_int.get_page_by_res(r)['offset']).strip('()').split(', ')
            csv_line.extend([[txt],
                             ['rank', i, '', 'offset', offset[0], offset[1],
                              '', 'rel?', rel]])
            # commented until bugs fixed
            # txt = []
            # for offset in r['offset']:
            #     txt.append(ann_cl.get_doc('_'.join(r['_type'].split('_')[:2]),
            #                               r['_type'].split('_')[2], offset))
            # txt = ' ... '.join(txt)
            # csv_line.extend([[txt], ['rank', i, '', 'offset',
            #                          r['offset'][0][0], r['offset'][0][1],
            #                          '', 'rel?', rel]])
            # csv_line.append([''])
        csv_line.extend([[''], ['']])
    with file(dump_path, 'wb') as csv_file:
        wr = csv.writer(csv_file)
        wr.writerows(csv_line)
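# Usage sketch, not part of the original code (assumption: annotations and
# per-citance result lists are already paired up, and index_name matches
# the index the results were retrieved from):
#
#   dump_stats(zip(annotations, results), 'tmp/stats.csv', 'biosum')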
def __init__(self, documents, eshost='localhost', esport=9200,
             esindex='pubmed21', cachedir='cache'):
    self.cachedir = cachedir
    self.questions = documents
    self.categories = None
    self.added = dict([(qid, []) for qid in self.questions.keys()])
    self.removed = dict([(qid, []) for qid in self.questions.keys()])
    self.es = ESInterface(host=eshost, port=esport, index_name=esindex)
    self.tokenquestions = self.tokenize_questions(self.questions.items())
    self.tokquestions = dict([(k, " ".join(v))
                              for k, v in self.tokenquestions.iteritems()])
    self.run()
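# Usage sketch, not part of the original code (assumptions: this __init__
# belongs to a runner class, here hypothetically called QuestionRunner,
# whose tokenize_questions() and run() methods are defined elsewhere):
#
#   runner = QuestionRunner({'q1': 'What regulates cell migration?'},
#                           esindex='pubmed21')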
class Method(MethodInterface):
    """ Produce reference text by submitting the citance
        to the ElasticSearch server. """
    method_opts = {'maxsize': {'type': int, 'default': 3},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'analyzer': {'default': False, 'type': str},
                   'ngram': {'default': False, 'type': int}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.stopwords_path:
            stop_path = self.opts.stopwords_path
        else:
            stop_path = STOPWORDS_PATH
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)

    def run(self, test_data):
        # with codecs.open('tmp/test_data.json', 'wb', 'utf-8') as mf:
        #     json.dump(test_data, mf, indent=2)
        out_results = []
        det_res = {}
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            doc_type = doc_type.replace('train', 'eval')
            doc_type = doc_type.replace(',', '').replace("'", '"')
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            doc_type = doc_type.replace('eval', 'train')
            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))
            # preprocess (removes citations) and tokenizes
            # citation text before submitting to elasticsearch
            q = self.regex_citation('', ann['citation_text'])
            q = q.encode('ascii', 'ignore')
            # tokens = self.es_int.tokenize(q, "sentence")
            tokens = self.tokenizer.tokenize(q)
            tokens = ['"' + t + '"' if '-' in t else t for t in tokens]
            q = ' '.join([t for t in tokens
                          if (t not in self.stopwords and
                              t not in authors and
                              not (self.all_digits(t)))])
            if self.opts.ngram:
                tokens = self.es_int.tokenize(q, "sentence")
                new_query = ''
                # + 1 so the final n-gram window is included
                for i in range(len(tokens) - self.opts.ngram + 1):
                    tmp = ''
                    for j in range(i, i + self.opts.ngram):
                        tmp += tokens[j] + ' '
                    new_query += '"' + tmp.strip() + '" '
                q = new_query.strip()
            # q = '*:*'
            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q, maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    params={'analyzer': self.opts.analyzer})
            else:
                r = self.es_int.simple_search(
                    q, maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
                # beg = e['offset'][0][0] - \
                #     100 if e['offset'][0][0] else e['offset'][0][0]
                # end = e['offset'][0][1] + 100
                # e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
                e['topic'] = ann['topic_id'].lower()
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query': q,
                          '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        # with codecs.open('tmp/out_results.json', 'wb', 'utf-8') as mf:
        #     json.dump(out_results, mf, indent=2)
        # sys.exit()
        return out_results
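# Worked example of the n-gram windowing above (tokens are illustrative;
# not part of the original code):
example_tokens = ['cell', 'migration', 'assay']
example_ngram = 2
example_q = ' '.join('"%s"' % ' '.join(example_tokens[i:i + example_ngram])
                     for i in range(len(example_tokens) - example_ngram + 1))
# example_q == '"cell migration" "migration assay"'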
def __init__(self, cache_index='cache'):
    self.es_int = ESInterface(index_name=cache_index)
class InsertionRankHelper(object):
    """docstring for InsertionRankHelper"""

    def __init__(self, filter_list=None, base_cache=None, cachedir='cache',
                 eshost=None, esport=None, esindex=None,
                 sim_func=CosineSimilarity(), stopwords=None,
                 weighted=False, query_terms_only=False):
        if not eshost:
            eshost = 'localhost'
        if not esport:
            esport = 9200
        if not esindex:
            esindex = 'pubmed'
        self.es = ESInterface(host=eshost, port=esport, index_name=esindex)
        self.cachedir = cachedir
        self.sim_func = sim_func
        self.timer = Timer(prefix='[timer]')
        self.weighted = weighted
        self.query_terms_only = query_terms_only
        self.base_cache = base_cache
        if not stopwords:
            stopwords = set()
        self._stopwords = stopwords
        if filter_list:
            filter_list = set([e for e in filter_list
                               if e not in stopwords])
        self._filter_list = filter_list
        # calculate fingerprint to use as cache comment!
        finger_text = ' '.join([w for w in
                                set.union((self._filter_list or set()),
                                          self._stopwords)])
        finger_md5 = md5()
        finger_md5.update(finger_text.encode('utf-8'))
        self.finger_filter = finger_md5.hexdigest()

    def _in_filter_list(self, elem):
        if elem in self._stopwords:
            return False
        try:
            return (elem in self._filter_list)
        except TypeError:
            return True

    @simple_caching()
    def _get_docs_content_insrank(self, results):
        for res in results:
            content = self.es.get_page(res['id'], res['type'])
            content.pop('references', None)
            res['content'] = [e[1] for e in content.items()]
        return results

    @simple_caching()
    def _tokenize_and_expand(self, qid, question, results):
        """ Takes care of tokenizing the question/results
            and expanding them using one of the four
            dictionary expansion methods.
        """
        cache_comment = self.base_cache + qid
        results = self._get_docs_content_insrank(
            results, cache_comment=cache_comment)
        docs = {r['id']: unicode(r['content']).replace(':', ' ')
                for r in results}
        docs[qid] = question
        # filters out terms
        docs = {did: ' '.join([w for w in doc.split()
                               if self._in_filter_list(w)])
                for did, doc in docs.items()}
        return self.exp_method(docs).objout

    def get_docs(self, qid, question, results):
        cache_comment = (self.base_cache +
                         '{0}_{1}_{2}'.format(qid,
                                              self.exp_method.__name__,
                                              self.finger_filter))
        docs = self._tokenize_and_expand(qid, question, results,
                                         cache_comment=cache_comment)
        question = Document(qid, 0, docs.pop(qid).split(), float('inf'),
                            'query', float('inf'), 0)
        doc_results = []
        for res in results:
            res['tokens'] = docs[res['id']].split()
            # eliminates tokens that are not part of the
            # question if specified by argument query_terms_only
            if self.query_terms_only:
                res['tokens'] = [t for t in res['tokens']
                                 if t in question.terms]
            doc_results.append(Document(res['id'], res['rank'],
                                        res['tokens'], res['relevance'],
                                        res['type'], res['score'],
                                        res['rank'],
                                        weighted=self.weighted))
        return question, doc_results

    def _swap_position(self, pos_list, posA, posB):
        elA = pos_list[posA]
        elB = pos_list[posB]
        elA.rank = posB + 1
        elB.rank = posA + 1
        pos_list[posB] = elA
        pos_list[posA] = elB
        return True

    def _is_swappable(self, doc, new_rank):
        shift = int(fabs(doc.original_rank - new_rank))
        if shift > self.max_rerank_pos:
            return False
        else:
            return True

    def rerank(self, qid, question, results, exp_method,
               max_rerank_pos=None, training_mode=False):
        """ Performs reranking """
        # Retrieves dynamically the methods in expansion_methods
        # using inspect module.
        methods = inspect.getmembers(expansion_methods, inspect.isclass)
        methods = [str(e[1]).split('.') for e in methods]
        methods = [e[len(e) - 1] for e in methods]
        # tries to load such method from expansion_methods. If it
        # fails, it terminates with status 1
        try:
            self.exp_method = getattr(expansion_methods, exp_method)
        except AttributeError:
            print >> sys.stderr, ('[error] {m} is not a valid method: ' +
                                  'use {l}.').format(m=exp_method,
                                                     l=', '.join(methods))
            sys.exit(1)
        # if no maximum number of shifts is set, it lets
        # results move up/down as much as they want
        if not max_rerank_pos:
            max_rerank_pos = len(results)
        self.max_rerank_pos = max_rerank_pos
        # true if at least a pair of elements have been swapped
        # or if is before first iteration
        swap_flag = True
        self.timer('expansion query {qid}'.format(qid=qid), quiet=True)
        question, docs = self.get_docs(qid, question, results)
        self.timer('expansion query {qid}'.format(qid=qid),
                   quiet=training_mode)
        self.timer('reranking {qid}'.format(qid=qid), quiet=True)
        while swap_flag:
            swap_flag = False
            for (i, j) in [(i, i + 1) for i in range(len(docs) - 1)]:
                sim_i = self.sim_func(question, docs[i])
                sim_j = self.sim_func(question, docs[j])
                if (sim_j > sim_i and
                        self._is_swappable(docs[i], j + 1) and
                        self._is_swappable(docs[j], i + 1)):
                    self._swap_position(docs, i, j)
                    swap_flag = True
        self.timer('reranking {qid}'.format(qid=qid), quiet=training_mode)
        if not training_mode:
            # calculate and print statistics on # of shifts
            rankvals = np.array([fabs(d.original_rank - d.rank)
                                 for d in docs])
            msg = '[info] shift avg: {:.2f}\t shift stdev: {:.2f}'
            print msg.format(rankvals.mean(), rankvals.std())
        out = [{'id': d.id, 'score': d.score, 'rank': d.rank,
                'relevance': d.relevance,
                'original_rank': d.original_rank}
               for d in sorted(docs, key=lambda o: o.rank)]
        return out
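# Usage sketch, not part of the original code (assumptions: 'results' comes
# from a prior retrieval and each dict carries the id/type/rank/score/
# relevance keys used in get_docs(), and 'Porter' names a class defined in
# expansion_methods):
#
#   helper = InsertionRankHelper(base_cache='run1_', esindex='pubmed')
#   reranked = helper.rerank('q1', 'what regulates cell migration',
#                            results, 'Porter', max_rerank_pos=10)
#
# rerank() bubble-sorts adjacent results by similarity to the expanded
# question, never moving a document more than max_rerank_pos places from
# its original rank.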
class Method(MethodInterface):
    """ Produce reference text by submitting the citance
        to the ElasticSearch server. """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'analyzer': {'default': False, 'type': str},
                   'ngram': {'default': False, 'type': int},
                   'concept_boost': {'default': 3, 'type': int},
                   'np_boost': {'default': 3, 'type': int},
                   'sent_boost': {'default': 1, 'type': int},
                   'stem_boost': {'default': 1, 'type': int},
                   'runmode': {'default': 'train'}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host=constants.mysql_server,
                                  port=constants.mysql_port,
                                  user=constants.mysql_user,
                                  passwd=constants.mysql_pass,
                                  db=constants.mysql_db)
        self.cur = self.db.cursor()
        self.ttys = ['SY']
        ttygroups = {"syns": ('AUN', 'EQ', 'SYN', 'MTH'),
                     "chemicals": ('CCN', 'CSN'),
                     "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
                     "diseases": ('DI', ),
                     "findings": ('FI', ),
                     "hierarchy": ('HS', 'HT', 'HX'),
                     "related": ('RT', ),
                     "preferred": ('PTN', 'PT')}
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
        # self.ann_client = AnnotationsClient()
        self.reg_apa = re.compile(  # [Chen et al. 2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")
        # [Chen et al. 2000]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        self.stemmer = stem.porter.PorterStemmer()
        # if len(args) > 3:
        #     self.ttys = []
        #     for tty in args[3:]:
        #         if tty in ttygroups:
        #             self.ttys.extend(ttygroups[tty])
        #         else:
        #             self.ttys.append(tty)

    def expand_concept(self, cdata, synonyms=False):
        rejected_semTypes = {'ftcn', 'qlco', 'qnco', 'inpr'}
        Okay = True
        for st in cdata['SemanticTypes']:
            if st in rejected_semTypes:
                Okay = False
        if Okay:
            if synonyms:
                return self.concept_synonyms(cdata['ConceptId'])
            else:
                return cdata['ConceptId']

    def concept_synonyms(self, cui):
        if cui in evaluate.cachefile:
            return set(evaluate.cachefile[cui])
        else:
            termtypes = (" and (TTY=" +
                         " OR TTY=".join(["'%s'" % x for x in self.ttys]) +
                         ")")
            # query = ('select * from (select distinct STR from MRCONSO a,'
            #          '(select distinct CUI1,AUI1,AUI2,RELA,CUI2 from MRREL '
            #          "where cui1 = '%s'" % cui +
            #          " and rela is not null) b where a.CUI=b.CUI2 "
            #          "and a.LAT='ENG') dd ;")
            query = ("select STR from MRCONSO where " +
                     "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +
                     termtypes + " and (SAB = 'SNOMEDCT_US')")
            # print query
            self.cur.execute(query)
            # self.cur.execute("select STR from MRCONSO where " +
            #                  "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +
            #                  termtypes + " and SAB != 'CHV'")
            syns = set(filter(lambda y: y.replace(" ", "").isalpha(),
                              [x.lower() for x, in self.cur.fetchall()]))
            evaluate.cachefile[cui] = list(syns)
            return syns

    def run(self, test_data):
        out_results = []
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            if self.opts.runmode == 'eval':
                doc_type = doc_type.replace('train', 'eval')
            doc = self.doc_mod.get_doc(ann['topic_id'].lower(),
                                       ann['citing_article'])
            cit_text = ann['citation_text']
            cit_text_doc = doc[ann['citation_offset'][0]:
                               ann['citation_offset'][1]]
            cit_marker = ann['citation_marker']
            cit_marker_doc = doc[ann['citation_marker_offset'][0]:
                                 ann['citation_marker_offset'][1]]
            cit_mrk_offset_sent = [ann['citation_marker_offset'][0] -
                                   ann['citation_offset'][0],
                                   ann['citation_marker_offset'][1] -
                                   ann['citation_offset'][0]]
            cleaned = self.reg_apa.sub('', cit_text_doc)
            cleaned = self.reg_ieee.sub('', cleaned)
            cleaned = self.reg_paranthesis.sub('', cleaned)
            cleaned = self.reg_apa_rare.sub('', cleaned)
            cleaned = re.sub('\s+', ' ', cleaned).strip()
            cleaned = re.sub('(,\s)+', ', ', cleaned).strip(', ')
            # -------------- IMMEDIATE NP BEFORE MARKER ----------
            m = list(self.reg_apa.finditer(cit_text_doc))
            m1 = list(self.reg_ieee.finditer(cit_text_doc))
            m2 = list(self.reg_paranthesis.finditer(cit_text_doc))
            # (start, end, group)
            if len(m) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m]
            elif len(m1) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m1]
            elif len(m2) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m2]
            else:
                m3 = list(self.reg_apa_rare.finditer(cit_text_doc))
                if len(m3) > 0:
                    markers = [(e.start(), e.end(), e.group(0)) for e in m3]
                else:
                    markers = []
            # NOTE: the 10000 threshold effectively disables this branch
            if len(markers) > 10000:
                nps = self.nlp_extractor.parse_by_mbsp(cleaned.strip())
                if nps is None:
                    q = cleaned
                else:
                    t = nps.split(' ')
                    concepts = []
                    for i in range(len(t)):
                        conc = []
                        toks = t[i].split('/')
                        while (('NP' in toks[2]) and (i < len(t))):
                            conc.append((toks[0], toks[6]))
                            i += 1
                            if i < len(t):
                                toks = t[i].split('/')
                        if len(conc) > 0:
                            concepts.append(conc)
                    noun_phrases = [' '.join([s1[0] for s1 in t1])
                                    for t1 in concepts]
                    # nps = self.nlp_extractor.extract_NP(cleaned,
                    #                                     mode='flattened')
                    # nps = [[[a[1:-1] for a in piece] for piece in sent]
                    #        for sent in nps]
                    # nps = [a[1:-1] for sent in nps for piece in sent
                    #        for a in piece]
                    # for e in nps:
                    #     noun_phrases = [(sub_e[0].replace('"', ''), idx)
                    #                     for idx, sub_e in enumerate(e)
                    #                     if sub_e[0].replace('"', '')
                    #                     not in self.stopwords]
                    tokens = self.tokenizer.tokenize(cit_text)
                    tokens_offsets = self.tokenizer.span_tokenize(
                        cit_text_doc)
                    nearest = ''
                    nearest_idx = -1
                    distance = 100000
                    # find nearest word to the citation marker
                    for idx, f in enumerate(tokens_offsets):
                        # check to see if in valid span (not citation
                        # markers)
                        invalid = False
                        for e in markers:
                            if f[0] >= e[0] and f[1] <= e[1]:
                                invalid = True
                        if ((cit_mrk_offset_sent[0] - f[1] >= 0) and
                                (cit_mrk_offset_sent[0] - f[1] < distance)
                                and not invalid):
                            distance = cit_mrk_offset_sent[0] - f[1]
                            if len(re.findall(r"^[^A-Za-z]+$",
                                              tokens[idx])) == 0:
                                nearest = tokens[idx]
                                if (idx > 0) and len(
                                        re.findall(r"^[^A-Za-z]+$",
                                                   tokens[idx - 1])) == 0:
                                    nearest = (tokens[idx - 1] + ' ' +
                                               tokens[idx])
                                nearest_idx = idx
                        elif (cit_mrk_offset_sent[0] < f[1]):
                            break
                    if (len(nearest.split(' ')) == 1 and nearest_idx > 0 and
                            tokens[nearest_idx] not in stops100):
                        nearest = tokens[idx - 1] + ' ' + tokens[idx]
                    largest = 0
                    q = ''
                    for n in noun_phrases:
                        if (nearest in n) and \
                                (len(nearest.split()) > largest):
                            q = '"%s"' % nearest
                            largest = len(nearest.split())
                    if q == '':
                        q = cleaned
                    q = sanitize(q)
                    # find longest noun phrase containing the nearest
                    # res = None
                    # for np in nps[0]:
                    #     if nearest in np and len(np) > longest and \
                    #             len(np) < 5:
                    #         longest = len(np)
                    #         res = np
                    # if res is not None:
                    #     res = ' '.join([el for el in res])
                    # else:
                    #     res = nearest
            else:
                try:
                    qtxt = unicodedata.normalize('NFKD', cleaned).encode(
                        'ascii', 'ignore')
                except:
                    qtxt = cleaned.encode('ascii', 'ignore')
                qterms = [qtxt]
                tokens = self.tokenizer.tokenize(' '.join(qterms))
                # tokens = self.es_int.tokenize(qtxt, analyzer=self.analyzer)
                q = ' '.join([t for t in tokens
                              if (t not in self.stopwords and
                                  not (self.all_digits(t)))])
            if self.opts.concept_boost > 0:
                qconcepts = mmrun(cleaned)
                qcids = []
                for cdata in qconcepts['concepts']:
                    newterms = self.expand_concept(cdata)
                    if newterms is not None:
                        qcids.append(newterms)
            else:
                qcids = []
            if self.opts.np_boost > 0:
                nps = self.nlp_extractor.extract_NP(qtxt, mode='flattened')
                noun_phs = set()
                for e in nps:
                    for e1 in e:
                        if len(e1) < 4:
                            all_stop = False
                            if self.opts.remove_stopwords:
                                tmp = ' '.join(sub_e.replace('"', '')
                                               for sub_e in e1
                                               if sub_e.replace('"', '')
                                               not in self.stopwords)
                            else:
                                count = 0
                                for sub_e in e1:
                                    if sub_e.replace('"', '') in \
                                            self.stopwords:
                                        count += 1
                                if count == len(e1):
                                    all_stop = True
                                tmp = ' '.join(sub_e.replace('"', '')
                                               for sub_e in e1)
                            if ('"' + tmp.replace('"', '') + '"'
                                    not in noun_phs and not all_stop):
                                noun_phs.add('"' + tmp.replace('"', '') +
                                             '"')
            else:
                noun_phs = []
            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q, maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    params={'analyzer': self.opts.analyzer})
            else:
                # r = self.es_int.multi_field_search(
                #     sentence=q,
                #     concepts=' '.join([w for w in qcids]),
                #     noun_phrases=' '.join([e for e in noun_phs]),
                #     maxsize=self.opts.maxsize,
                #     source_fields=['offset', 'sentence', 'mm-concepts',
                #                    'noun_phrases'],
                #     doc_type=doc_type,
                #     field_boost=[self.opts.sent_boost,
                #                  self.opts.concept_boost,
                #                  self.opts.np_boost])
                fields = ['sentence', 'mm-concepts', 'noun_phrases_1',
                          'stemmed']
                tokens1 = []
                for w in self.tokenizer.tokenize(cleaned):
                    Okay = True
                    if self.opts.remove_stopwords:
                        if w in self.stopwords:
                            Okay = False
                    if '-' in w:
                        tokens1.append(self.stemmer.stem(w.replace('-', '')))
                    if Okay:
                        tokens1.append(self.stemmer.stem(w))
                field_vals = [q,
                              ' '.join([w for w in qcids]),
                              (' '.join([e for e in noun_phs])).replace(
                                  '"', ''),
                              ' '.join([w for w in tokens1])]
                field_boosts = [self.opts.sent_boost,
                                self.opts.concept_boost,
                                self.opts.np_boost,
                                self.opts.stem_boost]
                r = self.es_int.multi_field_search(
                    field_vals=field_vals,
                    fields=fields,
                    source_fields=['offset', 'sentence'],
                    maxsize=self.opts.maxsize,
                    field_boost=field_boosts,
                    doc_type=doc_type)
            # r = self.es_int.find_all(doc_type=doc_type,
            #                          source_fields=['offset', 'sentence'])
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
                # beg = e['offset'][0][0] - \
                #     100 if e['offset'][0][0] else e['offset'][0][0]
                # end = e['offset'][0][1] + 100
                # e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'score': 0,
                          'sentence': [''],
                          'offset': [(0, 1)],
                          'query': q,
                          '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        return out_results
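# Sketch of the boosted multi-field query assembled above, not part of the
# original code (values are illustrative; the CUI is a placeholder):
#
#   fields = ['sentence', 'mm-concepts', 'noun_phrases_1', 'stemmed']
#   field_vals = ['cell migration assay',   # cleaned citance terms
#                 'C0000000',               # MetaMap concept ids
#                 'cell migration',         # extracted noun phrases
#                 'cell migrat assai']      # Porter-stemmed tokens
#   field_boosts = [1, 3, 3, 1]             # sent/concept/np/stem defaults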
class Method(MethodInterface):
    """ Produce reference text by submitting the citance
        to the ElasticSearch server. """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'remove-stopwords-phrase': {'default': False,
                                               'action': 'store_true'},
                   'noun-phrase': {'default': False,
                                   'action': 'store_true'},
                   'phrase-slop': {'type': int, 'default': 0},
                   'combine': {'default': False, 'action': 'store_true'},
                   'docs-path': {'default': DOCS_PATH},
                   'expand-window': {'default': False,
                                     'action': 'store_true'},
                   'query-terms': {'default': False,
                                   'action': 'store_true'},
                   'verbose': {'default': False, 'action': 'store_true'},
                   'qterm-weight': {'type': float, 'default': 1.0},
                   'phrase-weight': {'type': float, 'default': 2.0},
                   'surrounding-words-weight': {'type': float,
                                                'default': 1.0},
                   'filter-allstops': {'default': False,
                                       'action': 'store_true'},
                   'expand-results': {'type': int, 'default': 0},
                   'sentence': {'default': False, 'type': int},
                   'analyzer': {'default': False, 'type': str}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
        self.ann_client = AnnotationsClient()
        self.reg_apa = re.compile(  # [Chen et al. 2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")
        # [Chen et al. 2000]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()

    def run(self, test_data):
        out_results = []
        not_found = 0
        total = 0
        # outfile = codecs.open('tmp/nlp.txt', 'wb', 'UTF-8')
        processed = set()
        for ann in test_data:
            if (ann['topic_id'] + '_' +
                    str(ann['citance_number'])) not in processed:
                doc_type = '_'.join((ann['topic_id'].lower(),
                                     ann['reference_article'][:-4].lower()))
                doc_type = doc_type.replace(',', '').replace("'", '"')
                doc = self.doc_mod.get_doc(ann['topic_id'].lower(),
                                           ann['citing_article'])
                cit_text = ann['citation_text']
                cit_text_doc = doc[ann['citation_offset'][0]:
                                   ann['citation_offset'][1]]
                cit_marker = ann['citation_marker']
                cit_marker_doc = doc[ann['citation_marker_offset'][0]:
                                     ann['citation_marker_offset'][1]]
                cit_mrk_offset_sent = [ann['citation_marker_offset'][0] -
                                       ann['citation_offset'][0] + 1,
                                       ann['citation_marker_offset'][1] -
                                       ann['citation_offset'][1] + 1]
                cleaned = self.reg_apa.sub('', cit_text_doc)
                cleaned = self.reg_ieee.sub('', cleaned)
                cleaned = self.reg_paranthesis.sub('', cleaned)
                cleaned = self.reg_apa_rare.sub('', cleaned)
                cleaned = re.sub('\s+', ' ', cleaned).strip()
                cleaned = re.sub('(,\s)+', ', ', cleaned).strip(', ')
                chunks = set()
                # get noun phrases, format
                # [[[term1, term2], [term3]], [term4, term5]]
                nps = self.nlp_extractor.extract_NP(cleaned,
                                                    mode='flattened')
                # nps = [[[a[1:-1] for a in piece] for piece in sent]
                #        for sent in nps]
                # for e in nps:
                #     noun_phrases = [(sub_e[0].replace('"', ''), idx)
                #                     for idx, sub_e in enumerate(e)
                #                     if sub_e[0].replace('"', '')
                #                     not in self.stopwords]
                noun_phrases = [e for e in
                                list(itertools.chain.from_iterable(nps))
                                if e not in self.stopwords]
                # tokens = self.tokenizer.tokenize(cit_text)
                # tokens_offsets = self.tokenizer.span_tokenize(cit_text_doc)
                # cleaned = ''
                #
                # m = list(self.reg_apa.finditer(cit_text_doc))
                # m1 = list(self.reg_ieee.finditer(cit_text_doc))
                # m2 = list(self.reg_paranthesis.finditer(cit_text_doc))
                # # (start, end, group)
                # if len(m) > 0:
                #     markers = [(e.start(), e.end(), e.group(0)) for e in m]
                # elif len(m1) > 0:
                #     markers = [(e.start(), e.end(), e.group(0))
                #                for e in m1]
                # elif len(m2) > 0:
                #     markers = [(e.start(), e.end(), e.group(0))
                #                for e in m2]
                # else:
                #     m3 = list(self.reg_apa_rare.finditer(cit_text_doc))
                #     if len(m3) > 0:
                #         markers = [(e.start(), e.end(), e.group(0))
                #                    for e in m3]
                #     else:
                #         not_found += 1
                # nearest = ''
                # distance = 100000
                # if len(markers) > 1:
                #     # find nearest word to the citation marker
                #     for idx, f in enumerate(tokens_offsets):
                #         # check to see if in valid span (not citation
                #         # markers)
                #         invalid = False
                #         for e in markers:
                #             if f[0] >= e[0] and f[1] <= e[1]:
                #                 invalid = True
                #         if (cit_mrk_offset_sent[0] - f[1] >= 0) and \
                #                 (cit_mrk_offset_sent[0] - f[1] < distance) \
                #                 and not invalid:
                #             distance = cit_mrk_offset_sent[0] - f[1]
                #             if len(re.findall(r"^[^A-Za-z]+$",
                #                               tokens[idx])) == 0:
                #                 nearest = tokens[idx]
                #
                #     # find longest noun phrase containing the nearest
                #     longest = 0
                #     res = None
                #     for np in nps[0]:
                #         if nearest in np and len(np) > longest:
                #             longest = len(np)
                #             res = np
                #     if res is not None:
                #         res = ' '.join([el for el in res])
                #     else:
                #         res = nearest
                # else:
                #     # if there is only one citation marker, just consider
                #     # the whole citation text as the query
                #     q_tokens = []
                #     for idx, f in enumerate(tokens_offsets):
                #         invalid = False
                #         for e in markers:
                #             if f[0] >= e[0] and f[1] <= e[1]:
                #                 invalid = True
                #         if (cit_mrk_offset_sent[0] - f[1] >= 0) and \
                #                 (cit_mrk_offset_sent[0] - f[1] < distance) \
                #                 and not invalid:
                #             q_tokens.append(tokens[idx])
                #     res = ' '.join([f for f in q_tokens])
                q = noun_phrases
                q = ' '.join(q).encode('ascii', 'ignore')
                # outfile.write('query: "%s" \nparsed: "%s"\n\n'
                #               % (q, str(nps)))
                tokens = self.es_int.tokenize(q, "sentence")
                q = ' '.join([t for t in tokens
                              if (t not in self.stopwords and
                                  not (self.all_digits(t)))])
                if self.opts.analyzer:
                    r = self.es_int.simple_search(
                        q, maxsize=self.opts.maxsize,
                        source_fields=['offset', 'sentence'],
                        # field='sentence',
                        doc_type=doc_type,
                        params={'analyzer': self.opts.analyzer})
                else:
                    r = self.es_int.simple_search(
                        q, maxsize=self.opts.maxsize,
                        source_fields=['offset', 'sentence'],
                        # field='sentence',
                        doc_type=doc_type)
                for e in r:
                    fld = e.pop('fields')
                    e['offset'] = [eval(fld['offset'][0])]
                    beg = e['offset'][0][0] - \
                        100 if e['offset'][0][0] else e['offset'][0][0]
                    end = e['offset'][0][1] + 100
                    e['offset'] = [(beg, end)]
                    e['sentence'] = fld['sentence'][0]
                    e['query'] = q
                if self.opts.combine:
                    if len(r) == 0:
                        r = [{'_type': doc_type,
                              '_index': self.opts.index_name,
                              '_score': 0,
                              'sentence': '',
                              'offset': [(0, 1)],
                              'query': q,
                              '_id': -11}]
                    r = [{'_type': r[0]['_type'],
                          '_index': r[0]['_index'],
                          'query': q,
                          'topic': ann['topic_id'].lower(),
                          'citance_number': ann['citance_number'],
                          'citation_text': ann['citation_text'],
                          'citing_article': ann['citing_article'],
                          '_score': sum([e['_score'] for e in r]),
                          'offset': [e['offset'][0] for e in r],
                          'sentence': [e['sentence'] for e in r],
                          '_id': '-000001'}]
                out_results.append(r)
        return out_results
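# Worked example of the query construction above, not part of the original
# code: extract_NP returns nested token lists that are flattened,
# stopword-filtered, and joined, e.g.
#
#   [['cell', 'migration'], ['assay']]  ->  'cell migration assay'
#
# before being tokenized with the index's 'sentence' analyzer and submitted
# via simple_search.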
class Method(MethodInterface):
    """ Produce reference text by submitting the citance
        to the ElasticSearch server. """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'thresh': {'type': int, 'default': False},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': True,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'cache-path': {'default': 'cache'},
                   'idf_index': {'default': 'pubmed'}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(r"(\(\s?(([A-Za-z]+\.?\s?)+,? \d+"
                                         r"(\s?([;,]|and)\s)?)+\))|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])

    def run(self, test_data):
        out_results = []
        doc_freq_path = os.path.join(
            self.opts.cache_path,
            'idfidx' + self.opts.idf_index + 'wp_doc_freq.json')
        if os.path.exists(doc_freq_path):
            with codecs.open(doc_freq_path, 'rb', 'UTF-8') as mf:
                doc_freq = json.load(mf)
        else:
            doc_freq = {}
        es_int2 = ESAuth(host='devram4.cs.georgetown.edu',
                         index_name=self.opts.idf_index)
        count_docs = es_int2.count(query='*:*')
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))
            # preprocess (removes citations) and tokenizes
            # citation text before submitting to elasticsearch
            q = self.regex_citation('', ann['citation_text'])
            q = q.encode('ascii', 'ignore')
            terms = []
            for t in self.es_int.tokenize(q, 'sentence'):
                if (t not in self.stopwords and
                        t not in authors and
                        not (self.all_digits(t))):
                    if t not in doc_freq:
                        count = es_int2.count(t)
                        if count > 0:
                            idf = log(count_docs / float(count + 1))
                            doc_freq[t] = idf
                            terms.append(t)
                    else:
                        idf = doc_freq[t]
                        terms.append(t)
            avg_idf = np.average([doc_freq[t] for t in terms])
            # use the explicit threshold when provided; otherwise fall
            # back to the average idf of the query terms
            thresh = self.opts.thresh if self.opts.thresh else avg_idf
            q = ' '.join([t for t in terms if (doc_freq[t] > thresh)])
            if q == '':
                max_idf = -1
                for t in terms:
                    if max_idf < doc_freq[t]:
                        max_idf = doc_freq[t]
                        q = t
            r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                          source_fields=['offset',
                                                         'sentence'],
                                          field='sentence',
                                          doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
                # beg = e['offset'][0][0] - \
                #     100 if e['offset'][0][0] else e['offset'][0][0]
                # end = e['offset'][0][1] + 100
                # e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query': q,
                          '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        with codecs.open(doc_freq_path, 'wb', 'UTF-8') as mf:
            json.dump(doc_freq, mf, indent=2)
        return out_results
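# Worked example of the idf filter above (idf values are illustrative;
# not part of the original code):
import numpy as np

example_idf = {'cell': 1.2, 'the': 0.1, 'migration': 3.4}
example_thresh = np.average(example_idf.values())  # ~1.57
example_q = ' '.join(t for t in ['cell', 'the', 'migration']
                     if example_idf[t] > example_thresh)
# example_q == 'migration'; if no term clears the threshold, the single
# highest-idf term is used as the query instead.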
class Prep(object):

    def __init__(self, index='biosum'):
        self.es_int = ESInterface(index_name=index)

    def prep(self, docs_path='../data/TAC_2014_BiomedSumm_Training_Data',
             json_data_path='../data/v1-2a.json'):
        data = get_data(docs_path, json_data_path)
        train_set = {}
        for tid in data:
            train_set[tid] = []
            # citation number
            for cit in data[tid]:
                offsets = []
                ref_art = ''
                for ann in data[tid][cit].values():
                    for off in ann['ref_offset']:
                        offsets.append(off)
                    query = ann['cit_text']
                    ref_art = ann['ref_art']
                # union of all annotators' reference offsets
                offsets = union(offsets)
                doc_type = tid.lower() + '_' + ref_art.lower()[:-4]
                d = self._prep_data(clean(query), doc_type, offsets)
                train_set[tid].append(d)
        return train_set

    def _prep_data(self, query, doc_type, relevant_offsets,
                   save_path=False):
        '''
        Prepares the training data for learning to rank.
        Fetches the document from elastic_search and returns
        an x_train, y_train vector.

        Args:
            query(str): The query that is used to retrieve
                relevant offsets.
            doc_type(str): Name of the type on the elasticsearch index,
                e.g. 'd1409_train_sherr'.
            relevant_offsets(list): list of offsets that are relevant.

        Returns:
            list of tuples: a list of training data
                ('query', 'some text', bool (1 if relevant, 0 otherwise))
        '''
        hits = self.es_int.find_all(doc_type=doc_type)
        x_train = []
        y_train = []
        queries = []
        for hit in hits:
            label = 0
            offset = eval(hit['_source']['offset'])
            for off in relevant_offsets:
                if self.get_overlap(offset, off) > 0:
                    label = 1
                    break
            x_train.append(hit['_source']['sentence'])
            y_train.append(label)
            queries.append(query)
        # if save_path:
        #     with codecs.open(save_path, 'wb', 'utf-8') as mf:
        #         pickle.dump(zip(x_train, y_train), mf)
        return zip(queries, x_train, y_train)

    def get_overlap(self, a, b):
        return max(0, min(a[1], b[1]) - max(a[0], b[0]))
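# Usage sketch, not part of the original code (assumptions: get_data(),
# union(), and clean() are the module-level helpers used above):
#
#   prep = Prep(index='biosum')
#   train_set = prep.prep()
#   # train_set maps each topic id to lists of (query, sentence, label)
#   # triples, where label is 1 iff the sentence's offset overlaps any
#   # annotator-marked reference span (see get_overlap()).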
class Method(MethodInterface):
    """ Produce reference text by submitting the citance
        to the ElasticSearch server. """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'remove-stopwords-phrase': {'default': False,
                                               'action': 'store_true'},
                   'noun-phrase': {'default': False,
                                   'action': 'store_true'},
                   'phrase-slop': {'type': int, 'default': 0},
                   'combine': {'default': False, 'action': 'store_true'},
                   'docs-path': {'default': DOCS_PATH},
                   'expand-window': {'default': False,
                                     'action': 'store_true'},
                   'query-terms': {'default': False,
                                   'action': 'store_true'},
                   'verbose': {'default': False, 'action': 'store_true'},
                   'qterm-weight': {'type': float, 'default': 1.0},
                   'phrase-weight': {'type': float, 'default': 2.0},
                   'surrounding-words-weight': {'type': float,
                                                'default': 1.0},
                   'filter-allstops': {'default': False,
                                       'action': 'store_true'},
                   'expand-results': {'type': int, 'default': 0},
                   'sentence': {'default': False, 'type': int},
                   'analyzer': {'default': False, 'type': str}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
        self.ann_client = AnnotationsClient()

    def run(self, test_data):
        out_results = []
        # outfile = codecs.open('tmp/nlp.txt', 'wb', 'UTF-8')
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))
            # preprocess (removes citations) and tokenizes
            # citation text before submitting to elasticsearch
            q = self.regex_citation('', ann['citation_text'])
            q = re.sub(r'( ,)+', ',', q)
            q = q.encode('ascii', 'ignore')
            nlp_extractor = Extract_NLP_Tags()
            nps = nlp_extractor.extract_NP(q, mode='flattened')
            # outfile.write('query: "%s" \nparsed: "%s"\n\n' % (q, str(nps)))
            q1 = ''
            queryterms = set()
            for e in nps:
                for e1 in e:
                    if len(e1) < 4:
                        all_stop = False
                        if self.opts.remove_stopwords_phrase:
                            tmp = ' '.join(sub_e.replace('"', '')
                                           for sub_e in e1
                                           if sub_e.replace('"', '')
                                           not in self.stopwords)
                        else:
                            count = 0
                            for sub_e in e1:
                                if sub_e.replace('"', '') in self.stopwords:
                                    count += 1
                            if count == len(e1):
                                all_stop = True
                            tmp = ' '.join(sub_e.replace('"', '')
                                           for sub_e in e1)
                        if tmp not in queryterms and not all_stop:
                            q1 += '"' + tmp + '"^' + \
                                str(self.opts.phrase_weight) + ' '
                            queryterms.add(tmp)
            if self.opts.expand_window:
                window = self.doc_mod.get_para(
                    ann['topic_id'].lower(),
                    ann['citing_article'][:-4].lower(),
                    (ann['citation_offset'][0], ann['citation_offset'][1]))
                sorrounding_text = deepcopy(window['sentence'])
                st = self.regex_citation('', sorrounding_text)
                st = re.sub(r'( ,)+', ',', st)
                st = st.encode('ascii', 'ignore')
                other_nouns = nlp_extractor.extract_NP(st, mode='flattened')
                for e in other_nouns:
                    for e1 in e:
                        if len(e1) < 4:
                            all_stop = False
                            if self.opts.remove_stopwords_phrase:
                                tmp = ' '.join(sub_e.replace('"', '')
                                               for sub_e in e1
                                               if sub_e.replace('"', '')
                                               not in self.stopwords)
                            else:
                                count = 0
                                for sub_e in e1:
                                    if sub_e.replace('"', '') in \
                                            self.stopwords:
                                        count += 1
                                if count == len(e1):
                                    all_stop = True
                                tmp = ' '.join(sub_e.replace('"', '')
                                               for sub_e in e1)
                            if tmp not in queryterms and not all_stop:
                                q1 += '"' + tmp + '"^' + \
                                    str(self.opts.surrounding_words_weight) \
                                    + ' '
                                queryterms.add(tmp)
            if self.opts.query_terms:
                q = ' '.join([t + '^' + str(self.opts.qterm_weight)
                              for t in self.es_int.tokenize(q)
                              if (t not in self.stopwords and
                                  t not in authors and
                                  not (self.all_digits(t)))])
                q1 = q1 + ' ' + q
            if self.opts.verbose:
                print "query: %s" % q
                print "q1   : %s" % q1
                print '_____'
            # q2 = self.es_int.tokenize(q1, 'sentence')
            # q2 = ' '.join([t for t in self.es_int.tokenize(q1)
            #                if (t not in self.stopwords and
            #                    t not in authors and
            #                    not(self.all_digits(t)))])
            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q1.strip(), maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    phrase_slop=self.opts.phrase_slop,
                    params={'analyzer': self.opts.analyzer})
            else:
                r = self.es_int.simple_search(
                    q1.strip(), maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    phrase_slop=self.opts.phrase_slop)
            if self.opts.sentence:
                for idx, e in enumerate(deepcopy(r)):
                    if '_id' in e:
                        query = ' OR '.join(
                            ['_id:%s' % (str(int(e['_id']) + j).zfill(5))
                             for j in range(-1 * self.opts.sentence,
                                            self.opts.sentence + 1)
                             if j != 0 and int(e['_id']) + j > 0])
                        sour = self.es_int.simple_search(
                            query, doc_type=e['_type'],
                            maxsize=2 * self.opts.sentence,
                            source_fields=['offset', 'sentence'])
                        # aft = self.es_int.get_page(
                        #     str(int(e['_id']) + 1).zfill(5), e['_type'])
                        # bef = self.es_int.get_page(
                        #     str(int(e['_id']) + 1).zfill(5), e['_type'])
                        if len(sour) > 0:
                            for s in sour:
                                r.insert(idx + 1, s)
            for e in r:
                fld = e.pop('fields')
                if eval(fld['offset'][0])[0] < self.opts.expand_results:
                    beg = 0
                else:
                    beg = eval(fld['offset'][0])[0] - \
                        self.opts.expand_results
                endd = eval(fld['offset'][0])[1] + self.opts.expand_results
                e['offset'] = [(beg, endd)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q1
            r1 = deepcopy(r)
            r = []
            for idx, e in enumerate(r1):
                if idx < self.opts.maxsize:
                    r.append(e)
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query': q1,
                          '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q1,
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        return out_results
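# Sketch of the weighted query string built above, not part of the original
# code (phrases carry phrase-weight, surrounding-text phrases carry
# surrounding-words-weight, plain terms carry qterm-weight; defaults shown,
# and the phrase contents are illustrative):
#
#   "cell migration"^2.0 "transwell assay"^2.0 measured^1.0 rate^1.0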
class InsertionRankHelper(object):
    """docstring for InsertionRankHelper"""

    def __init__(self, filter_list=None, base_cache=None, cachedir='cache',
                 eshost=None, esport=None, esindex=None,
                 sim_func=CosineSimilarity(), stopwords=None,
                 weighted=False, query_terms_only=False):
        if not eshost:
            eshost = 'localhost'
        if not esport:
            esport = 9200
        if not esindex:
            esindex = 'pubmed'
        self.es = ESInterface(host=eshost, port=esport, index_name=esindex)
        self.cachedir = cachedir
        self.sim_func = sim_func
        self.timer = Timer(prefix='[timer]')
        self.weighted = weighted
        self.query_terms_only = query_terms_only
        self.base_cache = base_cache
        if not stopwords:
            stopwords = set()
        self._stopwords = stopwords
        if filter_list:
            filter_list = set([e for e in filter_list if e not in stopwords])
        self._filter_list = filter_list
        # calculate fingerprint to use as cache comment
        finger_text = ' '.join([w for w in set.union(
            (self._filter_list or set()), self._stopwords)])
        finger_md5 = md5()
        finger_md5.update(finger_text.encode('utf-8'))
        self.finger_filter = finger_md5.hexdigest()

    def _in_filter_list(self, elem):
        if elem in self._stopwords:
            return False
        try:
            return (elem in self._filter_list)
        except TypeError:
            # no filter list was provided: accept everything
            return True

    @simple_caching()
    def _get_docs_content_insrank(self, results):
        for res in results:
            content = self.es.get_page(res['id'], res['type'])
            content.pop('references', None)
            res['content'] = [e[1] for e in content.items()]
        return results

    @simple_caching()
    def _tokenize_and_expand(self, qid, question, results):
        """ Takes care of tokenizing the question/results and expanding
            them using one of the four dictionary expansion methods.
        """
        cache_comment = self.base_cache + qid
        results = self._get_docs_content_insrank(
            results, cache_comment=cache_comment)
        docs = {r['id']: unicode(r['content']).replace(':', ' ')
                for r in results}
        docs[qid] = question
        # filters out terms
        docs = {did: ' '.join([w for w in doc.split()
                               if self._in_filter_list(w)])
                for did, doc in docs.items()}
        return self.exp_method(docs).objout

    def get_docs(self, qid, question, results):
        cache_comment = (self.base_cache + '{0}_{1}_{2}'.format(
            qid, self.exp_method.__name__, self.finger_filter))
        docs = self._tokenize_and_expand(qid, question, results,
                                         cache_comment=cache_comment)
        question = Document(qid, 0, docs.pop(qid).split(), float('inf'),
                            'query', float('inf'), 0)
        doc_results = []
        for res in results:
            res['tokens'] = docs[res['id']].split()
            # eliminates tokens that are not part of the question
            # if specified by argument query_terms_only
            if self.query_terms_only:
                res['tokens'] = [t for t in res['tokens']
                                 if t in question.terms]
            doc_results.append(Document(res['id'], res['rank'],
                                        res['tokens'], res['relevance'],
                                        res['type'], res['score'],
                                        res['rank'],
                                        weighted=self.weighted))
        return question, doc_results

    def _swap_position(self, pos_list, posA, posB):
        elA = pos_list[posA]
        elB = pos_list[posB]
        elA.rank = posB + 1
        elB.rank = posA + 1
        pos_list[posB] = elA
        pos_list[posA] = elB
        return True

    def _is_swappable(self, doc, new_rank):
        shift = int(fabs(doc.original_rank - new_rank))
        return shift <= self.max_rerank_pos

    def rerank(self, qid, question, results, exp_method,
               max_rerank_pos=None, training_mode=False):
        """ Performs reranking """
        # Retrieves dynamically the methods in expansion_methods
        # using the inspect module.
        methods = inspect.getmembers(expansion_methods, inspect.isclass)
        methods = [str(e[1]).split('.') for e in methods]
        methods = [e[-1] for e in methods]
        # tries to load such method from expansion_methods;
        # if it fails, it terminates with status 1
        try:
            self.exp_method = getattr(expansion_methods, exp_method)
        except AttributeError:
            print >> sys.stderr, ('[error] {m} is not a valid method: ' +
                                  'use {l}.').format(m=exp_method,
                                                     l=', '.join(methods))
            sys.exit(1)
        # if no maximum number of shifts is set, results may move
        # up/down as much as they want
        if not max_rerank_pos:
            max_rerank_pos = len(results)
        self.max_rerank_pos = max_rerank_pos
        # true if at least one pair of elements has been swapped,
        # or before the first iteration
        swap_flag = True
        self.timer('expansion query {qid}'.format(qid=qid), quiet=True)
        question, docs = self.get_docs(qid, question, results)
        self.timer('expansion query {qid}'.format(qid=qid),
                   quiet=training_mode)
        self.timer('reranking {qid}'.format(qid=qid), quiet=True)
        while swap_flag:
            swap_flag = False
            for (i, j) in [(i, i + 1) for i in range(len(docs) - 1)]:
                sim_i = self.sim_func(question, docs[i])
                sim_j = self.sim_func(question, docs[j])
                if (sim_j > sim_i and
                        self._is_swappable(docs[i], j + 1) and
                        self._is_swappable(docs[j], i + 1)):
                    self._swap_position(docs, i, j)
                    swap_flag = True
        self.timer('reranking {qid}'.format(qid=qid), quiet=training_mode)
        if not training_mode:
            # calculate and print statistics on number of shifts
            rankvals = np.array([fabs(d.original_rank - d.rank)
                                 for d in docs])
            msg = '[info] shift avg: {:.2f}\t shift stdev: {:.2f}'
            print msg.format(rankvals.mean(), rankvals.std())
        out = [{'id': d.id,
                'score': d.score,
                'rank': d.rank,
                'relevance': d.relevance,
                'original_rank': d.original_rank}
               for d in sorted(docs, key=lambda o: o.rank)]
        return out
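# A toy walk-through (made-up similarity scores) of the bubble-style loop
# in rerank() above: adjacent results swap while the lower-ranked one is
# more similar to the question, but no result may drift more than
# max_shift positions from its original rank. Names are illustrative.
def _demo_rerank(sims, max_shift):
    order = list(range(len(sims)))  # positions hold original rank ids
    swapped = True
    while swapped:
        swapped = False
        for i in range(len(order) - 1):
            a, b = order[i], order[i + 1]
            # swap if b is more similar and neither move exceeds max_shift
            if (sims[b] > sims[a] and
                    abs(a - (i + 1)) <= max_shift and
                    abs(b - i) <= max_shift):
                order[i], order[i + 1] = b, a
                swapped = True
    return order

# _demo_rerank([0.2, 0.9, 0.5], max_shift=2) -> [1, 2, 0]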
class Method(MethodInterface):
    """ Produce reference text by submitting the citance to the
        ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'thresh': {'type': int, 'default': False},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': True,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'cache-path': {'default': 'cache'},
                   'idf_index': {'default': 'pubmed'}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(r"(\(\s?(([A-Za-z]+\.?\s?)+,? \d+"
                                         r"(\s?([;,]|and)\s)?)+\))|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])

    def run(self, test_data):
        out_results = []
        doc_freq_path = os.path.join(
            self.opts.cache_path,
            'idfidx' + self.opts.idf_index + 'wp_doc_freq.json')
        if os.path.exists(doc_freq_path):
            with codecs.open(doc_freq_path, 'rb', 'UTF-8') as mf:
                doc_freq = json.load(mf)
        else:
            doc_freq = {}
        es_int2 = ESAuth(host='devram4.cs.georgetown.edu',
                         index_name=self.opts.idf_index)
        count_docs = es_int2.count(query='*:*')
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))
            # preprocess (removes citations) and tokenize
            # citation text before submitting to elasticsearch
            q = self.regex_citation('', ann['citation_text'])
            q = q.encode('ascii', 'ignore')
            terms = []
            for t in self.es_int.tokenize(q, 'sentence'):
                if (t not in self.stopwords and
                        t not in authors and
                        not self.all_digits(t)):
                    if t not in doc_freq:
                        count = es_int2.count(t)
                        if count > 0:
                            idf = log(count_docs / float(count + 1))
                            doc_freq[t] = idf
                            terms.append(t)
                    else:
                        terms.append(t)
            avg_idf = np.average([doc_freq[t] for t in terms])
            # use the explicit threshold when one is given; otherwise fall
            # back to the collection-average idf (the original expression
            # had the branches of this conditional swapped)
            thresh = self.opts.thresh if self.opts.thresh else avg_idf
            q = ' '.join([t for t in terms if doc_freq[t] > thresh])
            if q == '':
                # no term survives the cutoff: keep the single term
                # with the highest idf
                max_idf = -1
                for t in terms:
                    if max_idf < doc_freq[t]:
                        max_idf = doc_freq[t]
                        q = t
            r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                          source_fields=['offset',
                                                         'sentence'],
                                          field='sentence',
                                          doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query': q,
                          '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        with codecs.open(doc_freq_path, 'wb', 'UTF-8') as mf:
            json.dump(doc_freq, mf, indent=2)
        return out_results
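# A small sketch of the idf cutoff used in run() above: per-term
# idf = log(N / (df + 1)); when no explicit threshold is supplied the
# collection-average idf becomes the cutoff and only terms above it
# survive. All numbers below are invented.
from math import log

def _demo_idf_filter(doc_freqs, n_docs, thresh=None):
    idfs = dict((t, log(n_docs / float(df + 1)))
                for t, df in doc_freqs.items())
    cutoff = thresh if thresh else sum(idfs.values()) / len(idfs)
    return sorted(t for t, v in idfs.items() if v > cutoff)

# _demo_idf_filter({'the': 9000, 'apoptosis': 40, 'kinase': 120}, 10000)
# keeps the rarer terms ['apoptosis', 'kinase'] and drops 'the'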
class Method(MethodInterface):
    """ Produce reference text by submitting the citance to the
        ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'analyzer': {'default': False, 'type': str},
                   'ngram': {'default': False, 'type': int},
                   'concept_boost': {'default': 3, 'type': int},
                   'np_boost': {'default': 3, 'type': int},
                   'sent_boost': {'default': 1, 'type': int},
                   'stem_boost': {'default': 1, 'type': int},
                   'runmode': {'default': 'train'}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host=constants.mysql_server,
                                  port=constants.mysql_port,
                                  user=constants.mysql_user,
                                  passwd=constants.mysql_pass,
                                  db=constants.mysql_db)
        self.cur = self.db.cursor()
        self.ttys = ['SY']
        ttygroups = {"syns": ('AUN', 'EQ', 'SYN', 'MTH'),
                     "chemicals": ('CCN', 'CSN'),
                     "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
                     "diseases": ('DI', ),
                     "findings": ('FI', ),
                     "hierarchy": ('HS', 'HT', 'HX'),
                     "related": ('RT', ),
                     "preferred": ('PTN', 'PT')}
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
        # self.ann_client = AnnotationsClient()
        self.reg_apa = re.compile(  # e.g. (Chen et al., 2000)
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")
        self.reg_apa_rare = re.compile(  # e.g. Chen et al. 2000
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        self.stemmer = stem.porter.PorterStemmer()
        # if len(args) > 3:
        #     self.ttys = []
        #     for tty in args[3:]:
        #         if tty in ttygroups:
        #             self.ttys.extend(ttygroups[tty])
        #         else:
        #             self.ttys.append(tty)

    def expand_concept(self, cdata, synonyms=False):
        rejected_semTypes = {'ftcn', 'qlco', 'qnco', 'inpr'}
        okay = True
        for st in cdata['SemanticTypes']:
            if st in rejected_semTypes:
                okay = False
        if okay:
            if synonyms:
                return self.concept_synonyms(cdata['ConceptId'])
            else:
                return cdata['ConceptId']

    def concept_synonyms(self, cui):
        if cui in evaluate.cachefile:
            return set(evaluate.cachefile[cui])
        termtypes = ("and (TTY=" +
                     " OR TTY=".join(["'%s'" % x for x in self.ttys]) + ")")
        # (commented-out alternatives queried MRREL for related CUIs and
        # allowed any SAB other than 'CHV'; a space has been added before
        # the termtypes clause, which the original concatenation dropped)
        query = ("select STR from MRCONSO where "
                 "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y' " % cui +
                 termtypes + " and (SAB = 'SNOMEDCT_US')")
        self.cur.execute(query)
        syns = set(filter(lambda y: y.replace(" ", "").isalpha(),
                          [x.lower() for x, in self.cur.fetchall()]))
        evaluate.cachefile[cui] = list(syns)
        return syns

    def run(self, test_data):
        out_results = []
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            if self.opts.runmode == 'eval':
                doc_type = doc_type.replace('train', 'eval')
            doc = self.doc_mod.get_doc(ann['topic_id'].lower(),
                                       ann['citing_article'])
            cit_text = ann['citation_text']
            cit_text_doc = doc[ann['citation_offset'][0]:
                               ann['citation_offset'][1]]
            cit_marker = ann['citation_marker']
            cit_marker_doc = doc[ann['citation_marker_offset'][0]:
                                 ann['citation_marker_offset'][1]]
            # marker offsets relative to the citance
            cit_mrk_offset_sent = [ann['citation_marker_offset'][0] -
                                   ann['citation_offset'][0],
                                   ann['citation_marker_offset'][1] -
                                   ann['citation_offset'][0]]
            cleaned = self.reg_apa.sub('', cit_text_doc)
            cleaned = self.reg_ieee.sub('', cleaned)
            cleaned = self.reg_paranthesis.sub('', cleaned)
            cleaned = self.reg_apa_rare.sub('', cleaned)
            cleaned = re.sub('\s+', ' ', cleaned).strip()
            cleaned = re.sub('(,\s)+', ', ', cleaned).strip(', ')

            # -------------- IMMEDIATE NP BEFORE MARKER ----------
            m = list(self.reg_apa.finditer(cit_text_doc))
            m1 = list(self.reg_ieee.finditer(cit_text_doc))
            m2 = list(self.reg_paranthesis.finditer(cit_text_doc))
            # markers as (start, end, group) tuples
            if len(m) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m]
            elif len(m1) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m1]
            elif len(m2) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m2]
            else:
                m3 = list(self.reg_apa_rare.finditer(cit_text_doc))
                if len(m3) > 0:
                    markers = [(e.start(), e.end(), e.group(0)) for e in m3]
                else:
                    markers = []
            # NOTE: the comparison below effectively disables this branch;
            # lower the constant to re-enable the nearest-NP query
            if len(markers) > 10000:
                nps = self.nlp_extractor.parse_by_mbsp(cleaned.strip())
                if nps is None:
                    q = cleaned
                else:
                    t = nps.split(' ')
                    concepts = []
                    for i in range(len(t)):
                        conc = []
                        toks = t[i].split('/')
                        while ('NP' in toks[2]) and (i < len(t)):
                            conc.append((toks[0], toks[6]))
                            i += 1
                            if i < len(t):
                                toks = t[i].split('/')
                        if len(conc) > 0:
                            concepts.append(conc)
                    noun_phrases = [' '.join([s1[0] for s1 in t1])
                                    for t1 in concepts]
                    # (commented-out alternative used extract_NP to build
                    # the noun phrase list instead of the MBSP parse)
                    tokens = self.tokenizer.tokenize(cit_text)
                    tokens_offsets = self.tokenizer.span_tokenize(
                        cit_text_doc)
                    nearest = ''
                    nearest_idx = -1
                    distance = 100000
                    # find nearest word to the citation marker
                    for idx, f in enumerate(tokens_offsets):
                        # check that the span is valid, i.e. not inside
                        # a citation marker
                        invalid = False
                        for e in markers:
                            if f[0] >= e[0] and f[1] <= e[1]:
                                invalid = True
                        if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
                                (cit_mrk_offset_sent[0] - f[1] < distance) and\
                                not invalid:
                            distance = cit_mrk_offset_sent[0] - f[1]
                            if len(re.findall(r"^[^A-Za-z]+$",
                                              tokens[idx])) == 0:
                                nearest = tokens[idx]
                                if (idx > 0) and len(re.findall(
                                        r"^[^A-Za-z]+$",
                                        tokens[idx - 1])) == 0:
                                    nearest = (tokens[idx - 1] + ' ' +
                                               tokens[idx])
                                nearest_idx = idx
                        elif cit_mrk_offset_sent[0] < f[1]:
                            break
                    if len(nearest.split(' ')) == 1 and nearest_idx > 0 and\
                            tokens[nearest_idx] not in stops100:
                        # (the original indexed with the stale loop
                        # variable idx here; nearest_idx is the intended
                        # index)
                        nearest = (tokens[nearest_idx - 1] + ' ' +
                                   tokens[nearest_idx])
                    largest = 0
                    q = ''
                    # keep the nearest term only if it appears in some
                    # noun phrase
                    for n in noun_phrases:
                        if (nearest in n) and\
                                (len(nearest.split()) > largest):
                            q = '"%s"' % nearest
                            largest = len(nearest.split())
                    if q == '':
                        q = cleaned
                    q = sanitize(q)
                    # (commented-out alternative searched for the longest
                    # noun phrase containing the nearest term)
            else:
                try:
                    qtxt = unicodedata.normalize(
                        'NFKD', cleaned).encode('ascii', 'ignore')
                except Exception:
                    qtxt = cleaned.encode('ascii', 'ignore')
                qterms = [qtxt]
                tokens = self.tokenizer.tokenize(' '.join(qterms))
                q = ' '.join([t for t in tokens
                              if (t not in self.stopwords and
                                  not self.all_digits(t))])
            if self.opts.concept_boost > 0:
                qconcepts = mmrun(cleaned)
                qcids = []
                for cdata in qconcepts['concepts']:
                    newterms = self.expand_concept(cdata)
                    if newterms is not None:
                        qcids.append(newterms)
            else:
                qcids = []
            if self.opts.np_boost > 0:
                nps = self.nlp_extractor.extract_NP(qtxt, mode='flattened')
                noun_phs = set()
                for e in nps:
                    for e1 in e:
                        if len(e1) < 4:
                            all_stop = False
                            if self.opts.remove_stopwords:
                                tmp = ' '.join(
                                    sub_e.replace('"', '') for sub_e in e1
                                    if sub_e.replace('"', '')
                                    not in self.stopwords)
                            else:
                                count = 0
                                for sub_e in e1:
                                    if sub_e.replace('"', '')\
                                            in self.stopwords:
                                        count += 1
                                if count == len(e1):
                                    all_stop = True
                                tmp = ' '.join(sub_e.replace('"', '')
                                               for sub_e in e1)
                            if ('"' + tmp.replace('"', '') + '"'
                                    not in noun_phs and not all_stop):
                                noun_phs.add(
                                    '"' + tmp.replace('"', '') + '"')
            else:
                noun_phs = []
            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q, maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    doc_type=doc_type,
                    params={'analyzer': self.opts.analyzer})
            else:
                # (a commented-out variant passed sentence, concepts and
                # noun_phrases keyword arguments to multi_field_search)
                fields = ['sentence', 'mm-concepts', 'noun_phrases_1',
                          'stemmed']
                tokens1 = []
                for w in self.tokenizer.tokenize(cleaned):
                    okay = True
                    if self.opts.remove_stopwords and w in self.stopwords:
                        okay = False
                    if '-' in w:
                        tokens1.append(self.stemmer.stem(w.replace('-', '')))
                    if okay:
                        tokens1.append(self.stemmer.stem(w))
                field_vals = [q,
                              ' '.join([w for w in qcids]),
                              (' '.join([e for e in noun_phs])).replace(
                                  '"', ''),
                              ' '.join([w for w in tokens1])]
                field_boosts = [self.opts.sent_boost,
                                self.opts.concept_boost,
                                self.opts.np_boost,
                                self.opts.stem_boost]
                r = self.es_int.multi_field_search(
                    field_vals=field_vals,
                    fields=fields,
                    source_fields=['offset', 'sentence'],
                    maxsize=self.opts.maxsize,
                    field_boost=field_boosts,
                    doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'score': 0,
                          'sentence': [''],
                          'offset': [(0, 1)],
                          'query': q,
                          '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        return out_results
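# A condensed sketch of how the three parallel lists passed to
# multi_field_search above line up: the citance is matched against the
# raw sentence text, its MetaMap concept ids, its noun phrases, and its
# stemmed tokens, each field with its own boost. Inputs are hypothetical,
# and the pairing below is only for illustration; the actual call takes
# fields, field_vals and field_boost as separate lists.
def _demo_field_query(sent, cids, noun_phs, stems, boosts=(1, 3, 3, 1)):
    fields = ['sentence', 'mm-concepts', 'noun_phrases_1', 'stemmed']
    vals = [sent, ' '.join(cids), ' '.join(noun_phs), ' '.join(stems)]
    return [('%s^%d' % (f, b), v)
            for f, b, v in zip(fields, boosts, vals)]

# _demo_field_query('tumor growth', ['C0027651'], ['tumor growth'],
#                   ['tumor', 'growth'])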
class Method(MethodInterface):
    """ Produce reference text by submitting the citance to the
        ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 3},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'analyzer': {'default': False, 'type': str},
                   'ngram': {'default': False, 'type': int}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.stopwords_path:
            stop_path = self.opts.stopwords_path
        else:
            stop_path = STOPWORDS_PATH
        if self.opts.remove_stopwords:
            # (the original opened self.opts.stopwords_path directly here,
            # ignoring the stop_path fallback computed above)
            with file(stop_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)

    def run(self, test_data):
        out_results = []
        det_res = {}
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME: the two
            # replacements below cancel out, except that any 'eval' in
            # the type name is normalized to 'train'
            doc_type = doc_type.replace('train', 'eval')
            doc_type = doc_type.replace(',', '').replace("'", '"')
            doc_type = doc_type.replace('eval', 'train')
            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))
            # preprocess (removes citations) and tokenize
            # citation text before submitting to elasticsearch
            q = self.regex_citation('', ann['citation_text'])
            q = q.encode('ascii', 'ignore')
            tokens = self.tokenizer.tokenize(q)
            # quote hyphenated tokens so they are matched as phrases
            tokens = ['"' + t + '"' if '-' in t else t for t in tokens]
            q = ' '.join([t for t in tokens
                          if (t not in self.stopwords and
                              t not in authors and
                              not self.all_digits(t))])
            if self.opts.ngram:
                tokens = self.es_int.tokenize(q, "sentence")
                new_query = ''
                for i in range(len(tokens) - self.opts.ngram):
                    tmp = ''
                    for j in range(i, i + self.opts.ngram):
                        tmp += tokens[j] + ' '
                    new_query += '"' + tmp.strip() + '" '
                q = new_query.strip()
            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q, maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    doc_type=doc_type,
                    params={'analyzer': self.opts.analyzer})
            else:
                r = self.es_int.simple_search(
                    q, maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
                e['topic'] = ann['topic_id'].lower()
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query': q,
                          '_id': -11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        return out_results
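# A minimal sketch of the --ngram option above: the token stream becomes
# overlapping quoted n-grams so elasticsearch matches phrases rather than
# a bag of words. Note the loop in run() uses range(len(tokens) - n) and
# so drops the final n-gram; the standard full window is shown here, with
# made-up tokens.
def _demo_ngram_query(tokens, n):
    grams = (' '.join(tokens[i:i + n])
             for i in range(len(tokens) - n + 1))
    return ' '.join('"%s"' % g for g in grams)

# _demo_ngram_query(['cell', 'cycle', 'arrest'], 2)
# -> '"cell cycle" "cycle arrest"'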