Example #1
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
        self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
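
The constructor above mostly wires up citation-detection regexes. A minimal, standalone sketch of what regex_citation does (the same pattern, applied through .sub as in the code; the sample citance is invented for illustration):

import re

# Same citation-stripping pattern as in the constructor above; the sample
# citance below is made up for illustration only.
regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                            r"(\[(\d+([,–]\s?)?)+\])|"
                            r"\[[\d,-]+\]").sub

citance = ("Topic models have been used for citation analysis "
           "(Smith et al. 2003) and for summarization [3, 7].")
print(regex_citation('', citance))   # both the APA and the IEEE marker are stripped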
Example #2
class BM25(Feature):
    def __init__(self, index_name):
        '''
        default constructor

        Args:
            index_name(str): The elasticsearch index name that will
                be used to retrieve documents and idfs
        '''
        self.es_int = ESInterface(index_name=index_name)
        print self.es_int.get_avg_size('sentence')
        self.avg_doc_length = -1

    def extract(self,
                query,
                document,
                stem=True,
                no_stopwords=True,
                b=0.75,
                k1=1.25):
        '''
        Args:
            query(str)
            document(str)
            stem(bool)
            no_stopwords(bool)
            b(float): Controls to what degree document length normalizes tf values.
            k1(float): Controls non-linear term frequency normalization
        '''
        q_terms = list(
            set([
                w for w in self.tokenize(
                    query, stem=stem, no_stopwords=no_stopwords)
            ]))
        d_terms = list(
            set([
                w for w in self.tokenize(
                    document, stem=stem, no_stopwords=no_stopwords)
            ]))
        d_len = len(self.tokenize(document, stem=False, no_stopwords=False))
        if self.avg_doc_length == -1:
            self.avg_doc_length = self.es_int.get_avg_size('sentence')
        score = 0
        for t in q_terms:
            score += self.es_int.get_idf(t) *\
                ((self._freq(t, d_terms) * (k1 + 1)) /
                 (self._freq(t, d_terms) +
                  k1 * (1 - b + b * (d_len / self.avg_doc_length))))
        return score

    def _freq(self, term, doc):
        '''
        Gets the frequency of a term in a doc

        Args:
            term(str)
            doc(list(str)) -- list of strings
        '''
        return len([1 for t in doc if t == term])
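
extract() above scores a query against a document with the Okapi BM25 formula (term frequencies taken over the deduplicated token set): for each query term t, idf(t) * f(t, d) * (k1 + 1) / (f(t, d) + k1 * (1 - b + b * |d| / avgdl)). A self-contained sketch of that formula, with a made-up idf table standing in for ESInterface.get_idf and a fixed average length standing in for get_avg_size('sentence'):

def bm25_score(q_terms, d_terms, idf, avg_doc_length, b=0.75, k1=1.25):
    # idf is a plain dict here; in the class above it comes from Elasticsearch
    d_len = float(len(d_terms))
    score = 0.0
    for t in set(q_terms):
        freq = d_terms.count(t)
        score += idf.get(t, 0.0) * (freq * (k1 + 1)) / \
            (freq + k1 * (1 - b + b * d_len / avg_doc_length))
    return score

idf = {'citation': 2.1, 'summarization': 3.4, 'the': 0.1}   # made-up values
query = ['citation', 'summarization']
doc = ['the', 'citation', 'text', 'is', 'used', 'for', 'summarization']
print(bm25_score(query, doc, idf, avg_doc_length=12.0))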
Example #3
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)

        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host=constants.mysql_server,
                                  port=constants.mysql_port,
                                  user=constants.mysql_user,
                                  passwd=constants.mysql_pass,
                                  db=constants.mysql_db)
        self.cur = self.db.cursor()
        self.ttys = ['SY']

        ttygroups = {
            "syns": ('AUN', 'EQ', 'SYN', 'MTH'),
            "chemicals": ('CCN', 'CSN'),
            "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
            "diseases": ('DI', ),
            "findings": ('FI', ),
            "hierarchy": ('HS', 'HT', 'HX'),
            "related": ('RT', ),
            "preferred": ('PTN', 'PT')
        }
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
        #         self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+"
        )
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)"
        )
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        self.stemmer = stem.porter.PorterStemmer()
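
ttygroups above maps friendly names to UMLS term-type (TTY) codes; the commented-out block in Example #21 shows how it was meant to extend self.ttys from extra command-line arguments. A minimal sketch of that expansion (the requested names below are sample input):

ttygroups = {"syns": ('AUN', 'EQ', 'SYN', 'MTH'),
             "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD')}
requested = ['syns', 'BN']            # a group name or a raw TTY code
ttys = ['SY']
for tty in requested:
    if tty in ttygroups:
        ttys.extend(ttygroups[tty])   # expand a whole group
    else:
        ttys.append(tty)              # keep a single code as-is
print(ttys)                           # ['SY', 'AUN', 'EQ', 'SYN', 'MTH', 'BN']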
Example #4
    def __init__(self, index_name=constants.bm25_index):
        '''
        default constructor

        Args:
            index_name(str): The elasticsearch index name that will
                be used to retrieve documents and idfs
        '''
        self.es_int = ESInterface(index_name=index_name)
        print self.es_int.get_avg_size('sentence')
        self.avg_doc_length = -1
Example #5
def dump_stats(dump_stats_data, dump_path, index_name):

    es_int = ESInterface(index_name=index_name)
    csv_line = []

    for ann, res in dump_stats_data:
        csv_line.extend([[
            ann['topic_id'], ann['citing_article'][:-4].lower(),
            ann['reference_article'][:-4].lower(), ann['discourse_facet']
        ], [''], [ann['citation_text'].encode('ascii', 'ignore')], ['']])
        offsets = chain(*[[s[0], s[1], '']
                          for s in sorted(ann['reference_offset'].keys(),
                                          key=lambda t: t[0])])
        csv_line.extend([list(offsets), ['']])
        csv_line.append(['prec:'])
        csv_line.extend([list(t) for t in calculate_ap([res], [ann]).items()])
        csv_line.append(['ndcg:'])
        csv_line.extend(
            [list(t) for t in calculate_ndcg([res], [ann]).items()])
        csv_line.append([''])
        for i, r in enumerate(res, start=1):
            rel = str(calculate_ndcg([[r]], [ann])['all'] > 0).upper()

            # temp until Arman fixes bug
            txt = es_int.get_page_by_res(r)['sentence'].encode(
                'ascii', 'ignore')
            offset = str(
                es_int.get_page_by_res(r)['offset']).strip('()').split(', ')
            csv_line.extend([[txt],
                             [
                                 'rank', i, '', 'offset', offset[0], offset[1],
                                 '', 'rel?', rel
                             ]])

            # commented until bugs fixed
#             txt = []
#             for offset in r['offset']:
#                 txt.append(ann_cl.get_doc('_'.join(r['_type'].split('_')[:2]),
#                                           r['_type'].split('_')[2], offset))
#             txt = ' ... '.join(txt)
#             csv_line.extend([[txt], ['rank', i, '', 'offset',
#                                      r['offset'][0][0], r['offset'][0][1],
#                                      '', 'rel?', rel]])
#             csv_line.append([''])
        csv_line.extend([[''], ['']])

    with file(dump_path, 'wb') as csv_file:
        wr = csv.writer(csv_file)
        wr.writerows(csv_line)
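
dump_stats builds the whole spreadsheet in memory as a list of rows (one inner list per CSV row, [''] for a blank row) and only then hands it to csv.writer. A small sketch of the layout it writes, with made-up values:

import csv

# One inner list per CSV row, as in dump_stats above; the values are invented.
rows = [
    ['d1418', 'citing01', 'ref01', 'method'],     # topic, citing, reference, facet
    [''],
    ['This method follows the approach of ...'],  # citation text
    [''],
    [120, 245, '', 300, 410, ''],                 # flattened reference offsets
    ['prec:'], ['all', 0.5],
    ['ndcg:'], ['all', 0.42],
]
with open('stats_sample.csv', 'wb') as csv_file:  # 'wb' as in the Python 2 original
    csv.writer(csv_file).writerows(rows)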
Example #6
def dump_stats(dump_stats_data, dump_path, index_name):

    es_int = ESInterface(index_name=index_name)
    csv_line = []

    for ann, res in dump_stats_data:
        csv_line.extend([[ann['topic_id'],
                          ann['citing_article'][:-4].lower(),
                          ann['reference_article'][:-4].lower(),
                          ann['discourse_facet']], [''],
                         [ann['citation_text'].encode('ascii', 'ignore')],
                         ['']])
        offsets = chain(*[[s[0], s[1], '']
                          for s in sorted(ann['reference_offset'].keys(),
                                          key=lambda t: t[0])])
        csv_line.extend([list(offsets), ['']])
        csv_line.append(['prec:'])
        csv_line.extend([list(t)
                         for t in calculate_ap([res], [ann]).items()])
        csv_line.append(['ndcg:'])
        csv_line.extend([list(t)
                         for t in calculate_ndcg([res], [ann]).items()])
        csv_line.append([''])
        for i, r in enumerate(res, start=1):
            rel = str(calculate_ndcg([[r]], [ann])['all'] > 0).upper()

            # temp until Arman fixes bug
            txt = es_int.get_page_by_res(
                r)['sentence'].encode('ascii', 'ignore')
            offset = str(es_int.get_page_by_res(r)['offset']).strip(
                '()').split(', ')
            csv_line.extend([[txt], ['rank', i, '', 'offset', offset[0],
                                     offset[1], '', 'rel?', rel]])

            # commented until bugs fixed
#             txt = []
#             for offset in r['offset']:
#                 txt.append(ann_cl.get_doc('_'.join(r['_type'].split('_')[:2]),
#                                           r['_type'].split('_')[2], offset))
#             txt = ' ... '.join(txt)
#             csv_line.extend([[txt], ['rank', i, '', 'offset',
#                                      r['offset'][0][0], r['offset'][0][1],
#                                      '', 'rel?', rel]])
#             csv_line.append([''])
        csv_line.extend([[''], ['']])

    with file(dump_path, 'wb') as csv_file:
        wr = csv.writer(csv_file)
        wr.writerows(csv_line)
Example #7
 def __init__(self, args, opts):
     super(Method, self).__init__(args, opts)
     self.es_int = ESInterface(host=self.opts.server,
                               port=self.opts.port,
                               index_name=self.opts.index_name)
     self.regex_citation = re.compile(r"(\(\s?(([A-Za-z]+\.?\s?)+,? \d+"
                                      r"(\s?([;,]|and)\s)?)+\))|"
                                      r"(\[(\d+([,–]\s?)?)+\])|"
                                      r"\[[\d,-]+\]").sub
     self.all_digits = re.compile(r"^\d+$").search
     if self.opts.remove_stopwords:
         with file(self.opts.stopwords_path) as f:
             self.stopwords = frozenset([l.strip().lower() for l in f])
     else:
         self.stopwords = frozenset([])
Example #8
class BM25(Feature):

    def __init__(self, index_name):
        '''
        default constructor

        Args:
            index_name(str): The elasticsearch index name that will
                be used to retrieve documents and idfs
        '''
        self.es_int = ESInterface(index_name=index_name)
        print self.es_int.get_avg_size('sentence')
        self.avg_doc_length = -1

    def extract(self, query, document, stem=True, no_stopwords=True, b=0.75, k1=1.25):
        '''
        Args:
            query(str)
            document(str)
            stem(bool)
            no_stopwords(bool)
            b(float): Controls to what degree document length normalizes tf values.
            k1(float): Controls non-linear term frequency normalization
        '''
        q_terms = list(set([w for w in self.tokenize(
            query, stem=stem, no_stopwords=no_stopwords)]))
        d_terms = list(set([w for w in self.tokenize(
            document, stem=stem, no_stopwords=no_stopwords)]))
        d_len = len(self.tokenize(document, stem=False, no_stopwords=False))
        if self.avg_doc_length == -1:
            self.avg_doc_length = self.es_int.get_avg_size('sentence')
        score = 0
        for t in q_terms:
            score += self.es_int.get_idf(t) *\
                ((self._freq(t, d_terms) * (k1 + 1)) /
                 (self._freq(t, d_terms) +
                  k1 * (1 - b + b * (d_len / self.avg_doc_length))))
        return score

    def _freq(self, term, doc):
        '''
        Gets the frequency of a term in a doc

        Args:
            term(str)
            doc(list(str)) -- list of strings
        '''
        return len([1 for t in doc if t == term])
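
Since extract() only ever calls get_idf and get_avg_size on the ES interface (plus the tokenize method inherited from Feature), it can be exercised without a live index. A hypothetical sketch, assuming the BM25 class above is importable; the stub classes and their values are made up:

class _StubES(object):
    # stands in for ESInterface; only the two calls extract() makes are mimicked
    def get_idf(self, term):
        return {'citation': 2.1, 'summarization': 3.4}.get(term, 0.5)

    def get_avg_size(self, doc_type):
        return 12.0

class _StubBM25(BM25):
    def __init__(self):                         # skip the real ES connection
        self.es_int = _StubES()
        self.avg_doc_length = -1

    def tokenize(self, text, stem=True, no_stopwords=True):
        return text.lower().split()             # crude stand-in for Feature.tokenize

score = _StubBM25().extract('citation summarization',
                            'the citation text is used for summarization')
print(score)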
Example #9
 def __init__(self, args, opts):
     super(Method, self).__init__(args, opts)
     self.es_int = ESInterface(host=self.opts.server,
                               port=self.opts.port,
                               index_name=self.opts.index_name)
     self.regex_citation = re.compile(
         r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
         r"(\[(\d+([,–]\s?)?)+\])|"
         r"\[[\d,-]+\]").sub
     self.all_digits = re.compile(r"^\d+$").search
     if self.opts.remove_stopwords:
         with file(self.opts.stopwords_path) as f:
             self.stopwords = frozenset([l.strip().lower() for l in f])
     else:
         self.stopwords = frozenset([])
     self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
     self.ann_client = AnnotationsClient()
    def __init__(self,
                 filter_list=None,
                 base_cache=None,
                 cachedir='cache',
                 eshost=None,
                 esport=None,
                 esindex=None,
                 sim_func=CosineSimilarity(),
                 stopwords=None,
                 weighted=False,
                 query_terms_only=False):

        if not eshost:
            eshost = 'localhost'
        if not esport:
            esport = 9200
        if not esindex:
            esindex = 'pubmed'

        self.es = ESInterface(host=eshost, port=esport, index_name=esindex)
        self.cachedir = cachedir
        self.sim_func = sim_func
        self.timer = Timer(prefix='[timer]')
        self.weighted = weighted
        self.query_terms_only = query_terms_only
        self.base_cache = base_cache

        if not stopwords:
            stopwords = set()
        self._stopwords = stopwords

        if filter_list:
            filter_list = set([e for e in filter_list if e not in stopwords])
        self._filter_list = filter_list

        # calculate fingerprint to use as cache comment!
        finger_text = ' '.join([
            w for w in set.union((self._filter_list or set()), self._stopwords)
        ])
        finger_md5 = md5()
        finger_md5.update(finger_text.encode('utf-8'))
        self.finger_filter = finger_md5.hexdigest()
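
The fingerprint above hashes the union of the filter list and the stopword set, so cached results are only reused when the same filtering configuration is in effect. A standalone sketch of the same idea (sorting the terms so the digest is stable across interpreter runs; the word lists are made up):

from hashlib import md5

filter_list = {'protein', 'receptor', 'kinase'}
stopwords = {'the', 'of', 'and'}

# hash a canonical rendering of the term sets to use as a cache key
finger_text = ' '.join(sorted(set.union(filter_list, stopwords)))
finger_filter = md5(finger_text.encode('utf-8')).hexdigest()
print(finger_filter)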
Example #11
    def __init__(self, index_name):
        '''
        default constructor

        Args:
            index_name(str): The elasticsearch index name that will
                be used to retrieve documents and idfs
        '''
        self.es_int = ESInterface(index_name=index_name)
        print self.es_int.get_avg_size('sentence')
        self.avg_doc_length = -1
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)

        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.stopwords_path:
            stop_path = self.opts.stopwords_path
        else:
            stop_path = STOPWORDS_PATH
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)

        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host=constants.mysql_server,
                                  port=constants.mysql_port,
                                  user=constants.mysql_user,
                                  passwd=constants.mysql_pass,
                                  db=constants.mysql_db)
        self.cur = self.db.cursor()
        self.ttys = ['SY']

        ttygroups = {"syns": ('AUN', 'EQ', 'SYN', 'MTH'),
                     "chemicals": ('CCN', 'CSN'),
                     "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
                     "diseases": ('DI', ), "findings": ('FI', ),
                     "hierarchy": ('HS', 'HT', 'HX'), "related": ('RT', ),
                     "preferred": ('PTN', 'PT')}
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
#         self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        self.stemmer = stem.porter.PorterStemmer()
Example #14
 def __init__(self, args, opts):
     super(Method, self).__init__(args, opts)
     self.es_int = ESInterface(host=self.opts.server,
                               port=self.opts.port,
                               index_name=self.opts.index_name)
     self.regex_citation = re.compile(r"(\(\s?(([A-Za-z]+\.?\s?)+,? \d+"
                                      r"(\s?([;,]|and)\s)?)+\))|"
                                      r"(\[(\d+([,–]\s?)?)+\])|"
                                      r"\[[\d,-]+\]").sub
     self.all_digits = re.compile(r"^\d+$").search
     if self.opts.remove_stopwords:
         with file(self.opts.stopwords_path) as f:
             self.stopwords = frozenset([l.strip().lower() for l in f])
     else:
         self.stopwords = frozenset([])
    def __init__(self,
                 filter_list=None,
                 base_cache=None,
                 cachedir='cache',
                 eshost=None,
                 esport=None,
                 esindex=None,
                 sim_func=CosineSimilarity(),
                 stopwords=None,
                 weighted=False,
                 query_terms_only=False):

        if not eshost:
            eshost = 'localhost'
        if not esport:
            esport = 9200
        if not esindex:
            esindex = 'pubmed'

        self.es = ESInterface(host=eshost, port=esport, index_name=esindex)
        self.cachedir = cachedir
        self.sim_func = sim_func
        self.timer = Timer(prefix='[timer]')
        self.weighted = weighted
        self.query_terms_only = query_terms_only
        self.base_cache = base_cache

        if not stopwords:
            stopwords = set()
        self._stopwords = stopwords

        if filter_list:
            filter_list = set([e for e in filter_list if e not in stopwords])
        self._filter_list = filter_list

        # calculate fingerprint to use as cache comment!
        finger_text = ' '.join([w
                                for w in set.union((self._filter_list
                                                    or set()),
                                                   self._stopwords)])
        finger_md5 = md5()
        finger_md5.update(finger_text.encode('utf-8'))
        self.finger_filter = finger_md5.hexdigest()
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)

        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.stopwords_path:
            stop_path = self.opts.stopwords_path
        else:
            stop_path = STOPWORDS_PATH
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
Example #17
    def __init__(self,
                 documents,
                 eshost='localhost',
                 esport=9200,
                 esindex='pubmed21',
                 cachedir='cache'):

        self.cachedir = cachedir
        self.questions = documents
        self.categories = None

        self.added = dict([(qid, []) for qid in self.questions.keys()])
        self.removed = dict([(qid, []) for qid in self.questions.keys()])

        self.es = ESInterface(host=eshost, port=esport, index_name=esindex)

        self.tokenquestions = self.tokenize_questions(self.questions.items())
        self.tokquestions = dict([(k, " ".join(v))
                                  for k, v in self.tokenquestions.iteritems()])

        self.run()
class Method(MethodInterface):
    """ Produce reference text by submitting the
        citance to the ElasticSearch server.
    """
    method_opts = {
        'maxsize': {
            'type': int,
            'default': 3
        },
        'stopwords-path': {
            'default': STOPWORDS_PATH
        },
        'remove-stopwords': {
            'default': False,
            'action': 'store_true'
        },
        'combine': {
            'default': False,
            'action': 'store_true'
        },
        'analyzer': {
            'default': False,
            'type': str
        },
        'ngram': {
            'default': False,
            'type': int
        }
    }

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)

        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.stopwords_path:
            stop_path = self.opts.stopwords_path
        else:
            stop_path = STOPWORDS_PATH
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)

    def run(self, test_data):
        #         with codecs.open('tmp/test_data.json', 'wb', 'utf-8') as mf:
        #             json.dump(test_data, mf, indent=2)
        out_results = []
        det_res = {}
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            doc_type = doc_type.replace('train', 'eval')
            doc_type = doc_type.replace(',', '').replace("'", '"')

            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            doc_type = doc_type.replace('eval', 'train')

            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))

            # preprocess (removes citations) and tokenizes
            # citation text before submitting to elasticsearch
            q = self.regex_citation('', ann['citation_text'])
            q = q.encode('ascii', 'ignore')
            #             tokens = self.es_int.tokenize(q, "sentence")
            tokens = self.tokenizer.tokenize(q)
            tokens = ['"' + t + '"' if '-' in t else t for t in tokens]
            q = ' '.join([
                t for t in tokens
                if (t not in self.stopwords and t not in authors
                    and not (self.all_digits(t)))
            ])

            if self.opts.ngram:
                tokens = self.es_int.tokenize(q, "sentence")
                new_query = ''
                for i in range(len(tokens) - self.opts.ngram):
                    tmp = ''
                    for j in range(i, i + self.opts.ngram):
                        tmp += tokens[j] + ' '
                    new_query += '"' + tmp.strip() + '" '
                q = new_query.strip()
#             q = '*:*'
            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q,
                    maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    params={'analyzer': self.opts.analyzer})
            else:
                r = self.es_int.simple_search(
                    q,
                    maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
                #                 beg = e['offset'][0][0] - \
                #                     100 if e['offset'][0][0] else e['offset'][0][0]
                #                 end = e['offset'][0][1] + 100
                #                 e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
                e['topic'] = ann['topic_id'].lower()

            if self.opts.combine:
                if len(r) == 0:
                    r = [{
                        '_type': doc_type,
                        '_index': self.opts.index_name,
                        '_score': 0,
                        'sentence': '',
                        'offset': [(0, 1)],
                        'query': q,
                        '_id': -11
                    }]
                r = [{
                    '_type': r[0]['_type'],
                    '_index': r[0]['_index'],
                    'query': q,
                    'topic': ann['topic_id'].lower(),
                    'citance_number': ann['citance_number'],
                    'citation_text': ann['citation_text'],
                    'citing_article': ann['citing_article'],
                    '_score': sum([e['_score'] for e in r]),
                    'offset': [e['offset'][0] for e in r],
                    'sentence': [e['sentence'] for e in r],
                    '_id': '-000001'
                }]
            out_results.append(r)


#         with codecs.open('tmp/out_results.json', 'wb', 'utf-8') as mf:
#             json.dump(out_results, mf, indent=2)
#         sys.exit()
        return out_results
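
The self.opts.ngram branch of run() above turns the token stream into a bag of quoted n-gram phrases before querying Elasticsearch. A standalone sketch of that construction (the token list is made up; note that, like the original loop, it stops one n-gram short of the end):

def ngram_phrase_query(tokens, n):
    # mirrors the loop in run(): range(len(tokens) - n), each n-gram quoted
    phrases = []
    for i in range(len(tokens) - n):
        phrases.append('"' + ' '.join(tokens[i:i + n]) + '"')
    return ' '.join(phrases)

tokens = ['coreference', 'resolution', 'for', 'biomedical', 'text']
print(ngram_phrase_query(tokens, 2))
# -> "coreference resolution" "resolution for" "for biomedical"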
 def __init__(self, cache_index='cache'):
     self.es_int = ESInterface(index_name=cache_index)
class InsertionRankHelper(object):

    """docstring for InsertionRankHelper"""

    def __init__(self,
                 filter_list=None,
                 base_cache=None,
                 cachedir='cache',
                 eshost=None,
                 esport=None,
                 esindex=None,
                 sim_func=CosineSimilarity(),
                 stopwords=None,
                 weighted=False,
                 query_terms_only=False):

        if not eshost:
            eshost = 'localhost'
        if not esport:
            esport = 9200
        if not esindex:
            esindex = 'pubmed'

        self.es = ESInterface(host=eshost, port=esport, index_name=esindex)
        self.cachedir = cachedir
        self.sim_func = sim_func
        self.timer = Timer(prefix='[timer]')
        self.weighted = weighted
        self.query_terms_only = query_terms_only
        self.base_cache = base_cache

        if not stopwords:
            stopwords = set()
        self._stopwords = stopwords

        if filter_list:
            filter_list = set([e for e in filter_list if e not in stopwords])
        self._filter_list = filter_list

        # calculate fingerprint to use as cache comment!
        finger_text = ' '.join([w
                                for w in set.union((self._filter_list
                                                    or set()),
                                                   self._stopwords)])
        finger_md5 = md5()
        finger_md5.update(finger_text.encode('utf-8'))
        self.finger_filter = finger_md5.hexdigest()

    def _in_filter_list(self, elem):
        if elem in self._stopwords:
            return False
        try:
            return (elem in self._filter_list)
        except TypeError:
            return True

    @simple_caching()
    def _get_docs_content_insrank(self, results):
        for res in results:
            content = self.es.get_page(res['id'], res['type'])
            content.pop('references', None)
            res['content'] = [e[1] for e in content.items()]

        return results

    @simple_caching()
    def _tokenize_and_expand(self, qid, question, results):
        """ Takes care of tokenizing the question/results and
            expanding them using one of the four dictionary
            expansion methods.
        """
        cache_comment = self.base_cache + qid
        results = self._get_docs_content_insrank(results,
                                                 cache_comment=cache_comment)
        docs = {r['id']: unicode(r['content']).replace(':', ' ')
                for r in results}
        docs[qid] = question

        # filters out terms
        docs = {did: ' '.join([w for w in doc.split()
                               if self._in_filter_list(w)])
                for did, doc in docs.items()}
        return self.exp_method(docs).objout

    def get_docs(self, qid, question, results):

        cache_comment = (self.base_cache +
                         '{0}_{1}_{2}'.format(qid, self.exp_method.__name__,
                                              self.finger_filter))

        docs = self._tokenize_and_expand(qid,
                                         question,
                                         results,
                                         cache_comment=cache_comment)

        question = Document(qid, 0, docs.pop(qid).split(),
                            float('inf'), 'query',
                            float('inf'), 0)

        doc_results = []
        for res in results:
            res['tokens'] = docs[res['id']].split()

            # eliminates tokens that are not part of the
            # question if specified by argument query_terms_only
            if self.query_terms_only:
                res['tokens'] = [t for t in res['tokens']
                                 if t in question.terms]

            doc_results.append(Document(res['id'], res['rank'], res['tokens'],
                                        res['relevance'], res['type'],
                                        res['score'], res['rank'],
                                        weighted=self.weighted))
        return question, doc_results

    def _swap_position(self, pos_list, posA, posB):
        elA = pos_list[posA]
        elB = pos_list[posB]

        elA.rank = posB + 1
        elB.rank = posA + 1

        pos_list[posB] = elA
        pos_list[posA] = elB

        return True

    def _is_swappable(self, doc, new_rank):
        shift = int(fabs(doc.original_rank - new_rank))
        if shift > self.max_rerank_pos:
            return False
        else:
            return True

    def rerank(self, qid, question, results,
               exp_method, max_rerank_pos=None,
               training_mode=False):
        """ Performs reranking """

        # Retrieves dynamically the methods in expansion_methods
        # using inspect module.
        methods = inspect.getmembers(expansion_methods, inspect.isclass)
        methods = [str(e[1]).split('.') for e in methods]
        methods = [e[len(e) - 1] for e in methods]

        # tries to load such method from expansion_methods. If it
        # fails, it terminates with status 1
        try:
            self.exp_method = getattr(expansion_methods, exp_method)
        except AttributeError:
            print >> sys.stderr, ('[error] {m} is not a valid method: ' +
                                  'use {l}.').format(m=exp_method,
                                                     l=', '.join(methods))
            sys.exit(1)

        # if no maximum number of shifts is set, it lets
        # results move up/down as much as they want
        if not(max_rerank_pos):
            max_rerank_pos = len(results)
        self.max_rerank_pos = max_rerank_pos

        # true if at least a pair of elements have been swapped
        # or if is before first iteration
        swap_flag = True

        self.timer('expansion query {qid}'.format(qid=qid), quiet=True)
        question, docs = self.get_docs(qid, question, results)
        self.timer('expansion query {qid}'.format(qid=qid),
                   quiet=training_mode)

        self.timer('reranking {qid}'.format(qid=qid), quiet=True)
        while swap_flag:
            swap_flag = False
            for (i, j) in [(i, i + 1) for i in range(len(docs) - 1)]:
                sim_i = self.sim_func(question, docs[i])
                sim_j = self.sim_func(question, docs[j])

                if (sim_j > sim_i and
                        self._is_swappable(docs[i], j + 1) and
                        self._is_swappable(docs[j], i + 1)):
                    self._swap_position(docs, i, j)
                    swap_flag = True

        self.timer('reranking {qid}'.format(qid=qid), quiet=training_mode)

        if not training_mode:
            # calculate and print statistics on # of shifts
            rankvals = np.array([fabs(d.original_rank - d.rank)
                                 for d in docs])
            msg = '[info] shift avg: {:.2f}\t shift stdev: {:.2f}'
            print msg.format(rankvals.mean(), rankvals.std())

        out = [{'id': d.id,
                'score': d.score,
                'rank': d.rank,
                'relevance': d.relevance,
                'original_rank': d.original_rank}
               for d in sorted(docs, key=lambda o: o.rank)]

        return out
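
rerank() above is a constrained bubble sort: adjacent results are swapped whenever the lower-ranked one is more similar to the question, while _is_swappable keeps every result within max_rerank_pos positions of its original rank. A standalone sketch of that loop on made-up similarity scores:

from math import fabs

def rerank_by_similarity(sims, max_shift):
    # one dict per result: original rank, current rank, similarity to the query
    docs = [{'orig': i + 1, 'rank': i + 1, 'sim': s} for i, s in enumerate(sims)]
    swapped = True
    while swapped:
        swapped = False
        for i in range(len(docs) - 1):
            a, b = docs[i], docs[i + 1]
            if (b['sim'] > a['sim']
                    and fabs(a['orig'] - (i + 2)) <= max_shift
                    and fabs(b['orig'] - (i + 1)) <= max_shift):
                docs[i], docs[i + 1] = b, a
                a['rank'], b['rank'] = i + 2, i + 1
                swapped = True
    return [d['orig'] for d in docs]

# the 2nd result is the most similar, but no result may move more than one position
print(rerank_by_similarity([0.2, 0.9, 0.4, 0.7], max_shift=1))   # -> [2, 1, 4, 3]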
Example #21
class Method(MethodInterface):
    """ Produce reference text by submitting the
        citance to the ElasticSearch server.
    """
    method_opts = {
        'maxsize': {
            'type': int,
            'default': 100
        },
        'stopwords-path': {
            'default': STOPWORDS_PATH
        },
        'remove-stopwords': {
            'default': False,
            'action': 'store_true'
        },
        'combine': {
            'default': False,
            'action': 'store_true'
        },
        'analyzer': {
            'default': False,
            'type': str
        },
        'ngram': {
            'default': False,
            'type': int
        },
        'concept_boost': {
            'default': 3,
            'type': int
        },
        'np_boost': {
            'default': 3,
            'type': int
        },
        'sent_boost': {
            'default': 1,
            'type': int
        },
        'stem_boost': {
            'default': 1,
            'type': int
        },
        'runmode': {
            'default': 'train'
        }
    }

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)

        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host=constants.mysql_server,
                                  port=constants.mysql_port,
                                  user=constants.mysql_user,
                                  passwd=constants.mysql_pass,
                                  db=constants.mysql_db)
        self.cur = self.db.cursor()
        self.ttys = ['SY']

        ttygroups = {
            "syns": ('AUN', 'EQ', 'SYN', 'MTH'),
            "chemicals": ('CCN', 'CSN'),
            "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
            "diseases": ('DI', ),
            "findings": ('FI', ),
            "hierarchy": ('HS', 'HT', 'HX'),
            "related": ('RT', ),
            "preferred": ('PTN', 'PT')
        }
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
        #         self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+"
        )
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)"
        )
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        self.stemmer = stem.porter.PorterStemmer()

#         if len(args) > 3:
#             self.ttys = []
#
#             for tty in args[3:]:
#                 if tty in ttygroups:
#                     self.ttys.extend(ttygroups[tty])
#                 else:
#                     self.ttys.append(tty)

    def expand_concept(self, cdata, synonyms=False):
        rejected_semTypes = {'ftcn', 'qlco', 'qnco', 'inpr'}
        Okay = True
        for st in cdata['SemanticTypes']:
            if st in rejected_semTypes:
                Okay = False
        if Okay:
            if synonyms:
                return self.concept_synonyms(cdata['ConceptId'])
            else:
                return cdata['ConceptId']

    def concept_synonyms(self, cui):
        if cui in evaluate.cachefile:
            return set(evaluate.cachefile[cui])
        else:
            termtypes = ("and (TTY=" +
                         " OR TTY=".join(["'%s'" % x
                                          for x in self.ttys]) + ")")
            #         query = 'select * from (select distinct STR from MRCONSO a,'+\
            #                 '(select distinct CUI1,AUI1,AUI2,RELA,CUI2 from MRREL where cui1 = \'%s\'' % cui +\
            #                 ' and rela is not null) b where a.CUI=b.CUI2 and a.LAT=\'ENG\') dd  ;'
            query = "select STR from MRCONSO where " +\
                "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +\
                termtypes + " and (SAB = 'SNOMEDCT_US')"
            #             print query
            self.cur.execute(query)

            #         self.cur.execute("select STR from MRCONSO where " +
            #                          "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +
            #                          termtypes + " and SAB != 'CHV'")

            syns = set(
                filter(lambda y: y.replace(" ", "").isalpha(),
                       [x.lower() for x, in self.cur.fetchall()]))
            evaluate.cachefile[cui] = list(syns)
            return syns

    def run(self, test_data):
        out_results = []
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            if self.opts.runmode == 'eval':
                doc_type = doc_type.replace('train', 'eval')

            doc = self.doc_mod.get_doc(ann['topic_id'].lower(),
                                       ann['citing_article'])
            cit_text = ann['citation_text']
            cit_text_doc = doc[
                ann['citation_offset'][0]:ann['citation_offset'][1]]
            cit_marker = ann['citation_marker']
            cit_marker_doc = doc[ann['citation_marker_offset'][0]:
                                 ann['citation_marker_offset'][1]]
            cit_mrk_offset_sent = [
                ann['citation_marker_offset'][0] - ann['citation_offset'][0],
                ann['citation_marker_offset'][1] - ann['citation_offset'][0]
            ]
            cleaned = self.reg_apa.sub('', cit_text_doc)
            cleaned = self.reg_ieee.sub('', cleaned)
            cleaned = self.reg_paranthesis.sub('', cleaned)
            cleaned = self.reg_apa_rare.sub('', cleaned)
            cleaned = re.sub('\s+', ' ', cleaned).strip()
            cleaned = re.sub('(,\s)+', ', ', cleaned).strip(', ')
            '''
            -------------- IMMEDIATE NP BEFORE MARKER ----------
            '''
            m = list(self.reg_apa.finditer(cit_text_doc))
            m1 = list(self.reg_ieee.finditer(cit_text_doc))
            m2 = list(self.reg_paranthesis.finditer(cit_text_doc))
            # (start, end, group)
            if len(m) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m]
            elif len(m1) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m1]
            elif len(m2) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m2]
            else:
                m3 = list(self.reg_apa_rare.finditer(cit_text_doc))
                if len(m3) > 0:
                    markers = [(e.start(), e.end(), e.group(0)) for e in m3]
                else:
                    markers = []

            if len(markers) > 10000:

                nps = self.nlp_extractor.parse_by_mbsp(cleaned.strip())
                if nps is None:
                    q = cleaned
                else:
                    t = nps.split(' ')
                    concepts = []
                    for i in range(len(t)):
                        conc = []
                        toks = t[i].split('/')
                        while (('NP' in toks[2]) and (i < len(t))):
                            conc.append((toks[0], toks[6]))
                            i += 1
                            if i < len(t):
                                toks = t[i].split('/')
                        if len(conc) > 0:
                            concepts.append(conc)
                    noun_phrases = [
                        ' '.join([s1[0] for s1 in t1]) for t1 in concepts
                    ]

                    #                 nps = self.nlp_extractor.extract_NP(cleaned, mode='flattened')
                    #                 nps = [[[a[1:-1] for a in piece] for piece in sent] for sent in nps]
                    #             nps = [a[1:-1] for sent in nps for piece in sent for a in piece]
                    #                 for e in nps:
                    #                     noun_phrases = [(sub_e[0].replace('"', ''),idx) for idx, sub_e in enumerate(e) if sub_e[0].replace('"', '') not in self.stopwords]
                    tokens = self.tokenizer.tokenize(cit_text)
                    tokens_offsets = self.tokenizer.span_tokenize(cit_text_doc)
                    nearest = ''
                    nearest_idx = -1
                    distance = 100000
                    # find nearest word to the citation marker
                    for idx, f in enumerate(tokens_offsets):
                        # check to see if in valid span (not citation markers)
                        invalid = False
                        for e in markers:
                            if f[0] >= e[0] and f[1] <= e[1]:
                                invalid = True
                        if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
                                (cit_mrk_offset_sent[0] - f[1] < distance) and\
                                not invalid:
                            distance = cit_mrk_offset_sent[0] - f[1]
                            if len(re.findall(r"^[^A-Za-z]+$",
                                              tokens[idx])) == 0:
                                nearest = tokens[idx]
                                if (idx > 0) and len(
                                        re.findall(r"^[^A-Za-z]+$",
                                                   tokens[idx - 1])) == 0:
                                    nearest = tokens[idx -
                                                     1] + ' ' + tokens[idx]
                                nearest_idx = idx
                        elif (cit_mrk_offset_sent[0] < f[1]):
                            break
                        if len(nearest.split(' ')) == 1 and nearest_idx > 0 and\
                                tokens[nearest_idx] not in stops100:
                            nearest = tokens[idx - 1] + ' ' + tokens[idx]
                    largest = 0
                    q = ''
                    for n in noun_phrases:
                        if (nearest in n) and (len(nearest.split()) > largest):
                            q = '"%s"' % nearest
                            largest = len(nearest.split())
                    if q == '':
                        q = cleaned
                q = sanitize(q)
# find longest noun phrase containing the nearest
#                 res = None
#                 for np in nps[0]:
#                    if nearest in np and len(np) > longest and len(np) < 5:
#                        longest = len(np)
#                        res = np
#                 if res is not None:
#                     res = ' '.join([el for el in res])
#                 else:
#                     res = nearest
            else:
                try:
                    qtxt = unicodedata.normalize('NFKD', cleaned).encode(
                        'ascii', 'ignore')
                except:
                    qtxt = cleaned.encode('ascii', 'ignore')
                qterms = [qtxt]
                tokens = self.tokenizer.tokenize(' '.join(qterms))
                #             tokens = self.es_int.tokenize(qtxt, analyzer=self.analyzer)
                q = ' '.join([
                    t for t in tokens
                    if (t not in self.stopwords and not (self.all_digits(t)))
                ])
                if self.opts.concept_boost > 0:

                    qconcepts = mmrun(cleaned)
                    qcids = []
                    for cdata in qconcepts['concepts']:
                        newterms = self.expand_concept(cdata)
                        if newterms is not None:
                            qcids.append(newterms)
                else:
                    qcids = []
                if self.opts.np_boost > 0:
                    nps = self.nlp_extractor.extract_NP(qtxt, mode='flattened')
                    noun_phs = set()
                    for e in nps:
                        for e1 in e:
                            if len(e1) < 4:
                                all_stop = False
                                if self.opts.remove_stopwords:
                                    tmp = ' '.join(
                                        sub_e.replace('"', '')
                                        for sub_e in e1 if sub_e.replace(
                                            '"', '') not in self.stopwords)
                                else:
                                    count = 0
                                    for sub_e in e1:
                                        if sub_e.replace('"',
                                                         '') in self.stopwords:
                                            count += 1
                                    if count == len(e1):
                                        all_stop = True
                                    tmp = ' '.join(
                                        sub_e.replace('"', '') for sub_e in e1)
                                if '"' + tmp.replace(
                                        '"', ''
                                ) + '"' not in noun_phs and not all_stop:
                                    noun_phs.add('"' + tmp.replace('"', '') +
                                                 '"')
                else:
                    noun_phs = []

            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q,
                    maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    params={'analyzer': self.opts.analyzer})
            else:
                #                 r = self.es_int.multi_field_search(sentence=q,
                #                                                    concepts=' '.join(
                #                                                        [w for w in qcids]),
                #                                                    noun_phrases=' '.join(
                #                                                        [e for e in noun_phs]),
                #                                                    maxsize=self.opts.maxsize,
                #                                                    source_fields=[
                #                                                        'offset', 'sentence', 'mm-concepts', 'noun_phrases'],
                #                                                    doc_type=doc_type,
                #                                                    field_boost=[self.opts.sent_boost,
                #                                                                 self.opts.concept_boost,
                # self.opts.np_boost])
                fields = [
                    'sentence', 'mm-concepts', 'noun_phrases_1', 'stemmed'
                ]
                tokens1 = []
                for w in self.tokenizer.tokenize(cleaned):
                    Okay = True
                    if self.opts.remove_stopwords:
                        if w in self.stopwords:
                            Okay = False
                    if '-' in w:
                        tokens1.append(self.stemmer.stem(w.replace('-', '')))
                    if Okay:
                        tokens1.append(self.stemmer.stem(w))
                field_vals = [
                    q, ' '.join([w for w in qcids]),
                    (' '.join([e for e in noun_phs])).replace('"', ''),
                    ' '.join([w for w in tokens1])
                ]
                field_boosts = [
                    self.opts.sent_boost, self.opts.concept_boost,
                    self.opts.np_boost, self.opts.stem_boost
                ]
                r = self.es_int.multi_field_search(
                    field_vals=field_vals,
                    fields=fields,
                    source_fields=['offset', 'sentence'],
                    maxsize=self.opts.maxsize,
                    field_boost=field_boosts,
                    doc_type=doc_type)
#             r = self.es_int.find_all(doc_type=doc_type, source_fields=['offset','sentence'])
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
                #                 beg = e['offset'][0][0] - \
                #                     100 if e['offset'][0][0] else e['offset'][0][0]
                #                 end = e['offset'][0][1] + 100
                #                 e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
            if self.opts.combine:
                if len(r) == 0:
                    r = [{
                        '_type': doc_type,
                        '_index': self.opts.index_name,
                        '_score': 0,
                        'score': 0,
                        'sentence': [''],
                        'offset': [(0, 1)],
                        'query': q,
                        '_id': -11
                    }]
                r = [{
                    '_type': r[0]['_type'],
                    '_index': r[0]['_index'],
                    'query': q,
                    'topic': ann['topic_id'].lower(),
                    'citance_number': ann['citance_number'],
                    'citation_text': ann['citation_text'],
                    'citing_article': ann['citing_article'],
                    '_score': sum([e['_score'] for e in r]),
                    'offset': [e['offset'][0] for e in r],
                    'sentence': [e['sentence'] for e in r],
                    '_id': '-000001'
                }]

            out_results.append(r)
        return out_results
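
multi_field_search belongs to the project's ESInterface wrapper, so its internals are not shown here; one plausible way the same boosted multi-field query could be expressed in raw Elasticsearch DSL is sketched below (an illustration of the idea, not the project's actual implementation; the sample field values are made up):

def build_boosted_query(field_vals, fields, field_boosts, maxsize):
    # one boosted match clause per (field, value) pair, combined with bool/should
    should = [{'match': {f: {'query': v, 'boost': b}}}
              for f, v, b in zip(fields, field_vals, field_boosts) if v]
    return {'size': maxsize,
            '_source': ['offset', 'sentence'],
            'query': {'bool': {'should': should}}}

body = build_boosted_query(
    field_vals=['coreference resolution', 'C0000000',
                'coreference resolution', 'corefer resolut'],
    fields=['sentence', 'mm-concepts', 'noun_phrases_1', 'stemmed'],
    field_boosts=[1, 3, 3, 1],
    maxsize=100)

Such a body could then be passed to the standard elasticsearch-py client, e.g. es.search(index=index_name, body=body).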
Example #22
class Method(MethodInterface):

    """ Produce reference text by submitting the
        citance to the ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'remove-stopwords-phrase': {'default': False,
                                               'action': 'store_true'},
                   'noun-phrase': {'default': False,
                                   'action': 'store_true'},
                   'phrase-slop': {'type': int, 'default': 0},
                   'combine': {'default': False, 'action': 'store_true'},
                   'docs-path': {'default': DOCS_PATH},
                   'expand-window': {'default': False, 'action': 'store_true'},
                   'query-terms': {'default': False, 'action': 'store_true'},
                   'verbose': {'default': False, 'action': 'store_true'},
                   'qterm-weight': {'type': float, 'default': 1.0},
                   'phrase-weight': {'type': float, 'default': 2.0},
                   'surrounding-words-weight': {'type': float, 'default': 1.0},
                   'filter-allstops': {'default': False, 'action': 'store_true'},
                   'expand-results': {'type': int, 'default': 0},
                   'sentence': {'default': False, 'type': int},
                   'analyzer': {'default': False, 'type': str}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
        self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()

    def run(self, test_data):
        out_results = []
        not_found = 0
        total = 0
#         outfile = codecs.open('tmp/nlp.txt' , 'wb' , 'UTF-8')
        processed = set()
        for ann in test_data:
            if (ann['topic_id'] + '_' + str(ann['citance_number'])) not in processed:
                doc_type = '_'.join((ann['topic_id'].lower(),
                                     ann['reference_article'][:-4].lower()))
                doc_type = doc_type.replace(',', '').replace("'", '"')
                doc = self.doc_mod.get_doc(
                    ann['topic_id'].lower(), ann['citing_article'])
                cit_text = ann['citation_text']
                cit_text_doc = doc[
                    ann['citation_offset'][0]:ann['citation_offset'][1]]
                cit_marker = ann['citation_marker']
                cit_marker_doc = doc[
                    ann['citation_marker_offset'][0]:ann['citation_marker_offset'][1]]
                cit_mrk_offset_sent = [ann['citation_marker_offset'][0] - ann['citation_offset'][0] + 1,
                                       ann['citation_marker_offset'][1] - ann['citation_offset'][0] + 1]
                cleaned = self.reg_apa.sub('', cit_text_doc)
                cleaned = self.reg_ieee.sub('', cleaned)
                cleaned = self.reg_paranthesis.sub('', cleaned)
                cleaned = self.reg_apa_rare.sub('', cleaned)
                cleaned = re.sub('\s+', ' ', cleaned).strip()
                cleaned = re.sub('(,\s)+', ', ', cleaned).strip(', ')
                chunks = set()
                # get noun phrases, format [[[term1, term2],[term3]][term4,
                # term5]]
                nps = self.nlp_extractor.extract_NP(cleaned, mode='flattened')
#                 nps = [[[a[1:-1] for a in piece] for piece in sent] for sent in nps]
#                 for e in nps:
#                     noun_phrases = [(sub_e[0].replace('"', ''),idx) for idx, sub_e in enumerate(e) if sub_e[0].replace('"', '') not in self.stopwords]
                noun_phrases = [e for e in list(itertools.chain.from_iterable(nps))
                                if e not in self.stopwords]
#                 tokens = self.tokenizer.tokenize(cit_text)
#                 tokens_offsets = self.tokenizer.span_tokenize(cit_text_doc)
#                 cleaned = ''
#
#                 m = list(self.reg_apa.finditer(cit_text_doc))
#                 m1 = list(self.reg_ieee.finditer(cit_text_doc))
#                 m2 = list(self.reg_paranthesis.finditer(cit_text_doc))
#                 # (start, end, group)
#                 if len(m) > 0:
#                     markers = [(e.start(), e.end(), e.group(0)) for e in m]
#                 elif len(m1) > 0:
#                     markers = [(e.start(), e.end(), e.group(0))
#                                for e in m1]
#                 elif len(m2) > 0:
#                     markers = [(e.start(), e.end(), e.group(0))
#                                for e in m2]
#                 else:
#                     m3 = list(self.reg_apa_rare.finditer(cit_text_doc))
#                     if len(m3) > 0:
#                         markers = [(e.start(), e.end(), e.group(0))
#                                    for e in m3]
#                     else:
#                         not_found += 1
#                 nearest = ''
#                 distance = 100000
#                 if len(markers) > 1:
#                     # find nearest word to the citation marker
#                     for idx, f in enumerate(tokens_offsets):
#                         # check to see if in valid span (not citation markers)
#                         invalid = False
#                         for e in markers:
#                             if f[0] >= e[0] and f[1] <= e[1]:
#                                 invalid = True
#                         if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
#                                 (cit_mrk_offset_sent[0] - f[1] < distance) and\
#                                 not invalid:
#                             distance = cit_mrk_offset_sent[0] - f[1]
#                             if len(re.findall(r"^[^A-Za-z]+$", tokens[idx])) == 0:
#                                 nearest = tokens[idx]
#
#                         # find longest noun phrase containing the nearest
#                         longest = 0
#                         res = None
#                         for np in nps[0]:
#                             if nearest in np and len(np) > longest:
#                                 longest = len(np)
#                                 res = np
#                         if res is not None:
#                             res = ' '.join([el for el in res])
#                         else:
#                             res = nearest
#                 else:
#                     # if there is only one citation marker, just consider the
#                     # whole citation text as the query
#                     q_tokens = []
#                     for idx, f in enumerate(tokens_offsets):
#                         invalid = False
#                         for e in markers:
#                             if f[0] >= e[0] and f[1] <= e[1]:
#                                 invalid = True
#                         if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
#                                 (cit_mrk_offset_sent[0] - f[1] < distance) and\
#                                 not invalid:
#                             q_tokens.append(tokens[idx])
#                     res = ' '.join([f for f in q_tokens])
                q = noun_phrases
                q = ' '.join(q).encode('ascii', 'ignore')
    #             outfile.write('query: "%s" \nparsed: "%s"\n\n' %(q,str(nps)) )
                tokens = self.es_int.tokenize(q, "sentence")
                q = ' '.join([t for t in tokens
                              if (t not in self.stopwords and
                                  not(self.all_digits(t)))])
                if self.opts.analyzer:
                    r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                                  source_fields=[
                                                      'offset', 'sentence'],
                                                  # field='sentence',
                                                  doc_type=doc_type,
                                                  params={'analyzer': self.opts.analyzer})
                else:
                    r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                                  source_fields=[
                                                      'offset', 'sentence'],
                                                  # field='sentence',
                                                  doc_type=doc_type)
                for e in r:
                    fld = e.pop('fields')
                    e['offset'] = [eval(fld['offset'][0])]
                    beg = e['offset'][0][0] - \
                        100 if e['offset'][0][0] else e['offset'][0][0]
                    end = e['offset'][0][1] + 100
                    e['offset'] = [(beg, end)]
                    e['sentence'] = fld['sentence'][0]
                    e['query'] = q
                if self.opts.combine:
                    if len(r) == 0:
                        r = [{'_type': doc_type,
                              '_index': self.opts.index_name,
                              '_score': 0,
                              'sentence': '',
                              'offset': [(0, 1)],
                              'query':q, '_id':-11}]
                    r = [{'_type': r[0]['_type'],
                          '_index': r[0]['_index'],
                          'query': q,
                          'topic': ann['topic_id'].lower(),
                          'citance_number': ann['citance_number'],
                          'citation_text': ann['citation_text'],
                          'citing_article': ann['citing_article'],
                          '_score': sum([e['_score'] for e in r]),
                          'offset': [e['offset'][0] for e in r],
                          'sentence': [e['sentence'] for e in r],
                          '_id': '-000001'}]
                out_results.append(r)
        return out_results
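A small sketch, with an illustrative helper name, of the fixed ±100-character window expansion applied to each hit above; clamping the start offset at zero is an addition in this sketch, the original only skips the subtraction when the start offset is falsy.

def expand_window_sketch(offset, pad=100):
    # offset is a (start, end) pair of character positions; the expanded
    # start is floored at zero so it never becomes negative.
    beg, end = offset
    return (max(0, beg - pad), end + pad)

# expand_window_sketch((40, 120)) -> (0, 220)
# expand_window_sketch((250, 320)) -> (150, 420)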
Exemple #23
0
class Method(MethodInterface):
    """ Produce reference text by submitting the
        citance to the ElasticSearch server.
    """
    method_opts = {
        'maxsize': {
            'type': int,
            'default': 100
        },
        'thresh': {
            'type': int,
            'default': False
        },
        'stopwords-path': {
            'default': STOPWORDS_PATH
        },
        'remove-stopwords': {
            'default': True,
            'action': 'store_true'
        },
        'combine': {
            'default': False,
            'action': 'store_true'
        },
        'cache-path': {
            'default': 'cache'
        },
        'idf_index': {
            'default': 'pubmed'
        }
    }

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(r"(\(\s?(([A-Za-z]+\.?\s?)+,? \d+"
                                         r"(\s?([;,]|and)\s)?)+\))|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])

    def run(self, test_data):
        out_results = []
        doc_freq_path = os.path.join(
            self.opts.cache_path,
            'idfidx' + self.opts.idf_index + 'wp_doc_freq.json')
        if os.path.exists(doc_freq_path):
            with codecs.open(doc_freq_path, 'rb', 'UTF-8') as mf:
                doc_freq = json.load(mf)
        else:
            doc_freq = {}
        es_int2 = ESAuth(host='devram4.cs.georgetown.edu',
                         index_name=self.opts.idf_index)
        count_docs = es_int2.count(query='*:*')
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')

            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))

            # preprocess (removes citations) and tokenizes
            # citation text before submitting to elasticsearch
            q = self.regex_citation('', ann['citation_text'])
            q = q.encode('ascii', 'ignore')
            terms = []
            for t in self.es_int.tokenize(q, 'sentence'):
                if (t not in self.stopwords and t not in authors
                        and not (self.all_digits(t))):
                    if t not in doc_freq.keys():
                        count = es_int2.count(t)
                        if count > 0:
                            idf = log(count_docs / float(count + 1))
                            doc_freq[t] = idf
                            terms.append(t)
                    else:
                        idf = doc_freq[t]
                        terms.append(t)
            avg_idf = np.average([doc_freq[t] for t in terms])
            thresh = self.opts.thresh if self.opts.thresh is not False\
                else avg_idf
            q = ' '.join([t for t in terms if (doc_freq[t] > thresh)])
            if q == '':
                max_idf = -1
                for t in terms:
                    if max_idf < doc_freq[t]:
                        max_idf = doc_freq[t]
                        q = t
            r = self.es_int.simple_search(q,
                                          maxsize=self.opts.maxsize,
                                          source_fields=['offset', 'sentence'],
                                          field='sentence',
                                          doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
                #                 beg = e['offset'][0][0] - \
                #                     100 if e['offset'][0][0] else e['offset'][0][0]
                #                 end = e['offset'][0][1] + 100
                #                 e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q

            if self.opts.combine:
                if len(r) == 0:
                    r = [{
                        '_type': doc_type,
                        '_index': self.opts.index_name,
                        '_score': 0,
                        'sentence': '',
                        'offset': [(0, 1)],
                        'query': q,
                        '_id': -11
                    }]
                r = [{
                    '_type': r[0]['_type'],
                    '_index': r[0]['_index'],
                    'query': q,
                    'topic': ann['topic_id'].lower(),
                    'citance_number': ann['citance_number'],
                    'citation_text': ann['citation_text'],
                    'citing_article': ann['citing_article'],
                    '_score': sum([e['_score'] for e in r]),
                    'offset': [e['offset'][0] for e in r],
                    'sentence': [e['sentence'] for e in r],
                    '_id': '-000001'
                }]
            out_results.append(r)
        with codecs.open(doc_freq_path, 'wb', 'UTF-8') as mf:
            json.dump(doc_freq, mf, indent=2)
        return out_results
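A standalone sketch, using an illustrative helper name and toy counts, of the IDF filtering performed above: each term gets idf = log(N / (df + 1)) against the external index, and only terms whose idf exceeds the threshold (the average idf when no explicit threshold is given) survive into the query.

from math import log

def idf_filter_sketch(term_counts, count_docs, thresh=None):
    # term_counts maps each query term to its document frequency in the
    # idf index; count_docs is the total number of documents there.
    idf = {t: log(count_docs / float(c + 1)) for t, c in term_counts.items()}
    if thresh is None:
        thresh = sum(idf.values()) / float(len(idf))
    return [t for t in idf if idf[t] > thresh]

# idf_filter_sketch({'the': 90000, 'citance': 12}, 100000)
# -> ['citance']   ('the' has a near-zero idf, far below the average)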
Exemple #24
0
    def __init__(self, index='biosum'):
        self.es_int = ESInterface(index_name=index)
Exemple #25
0
class Prep(object):
    def __init__(self, index='biosum'):
        self.es_int = ESInterface(index_name=index)

    def prep(self,
             docs_path='../data/TAC_2014_BiomedSumm_Training_Data',
             json_data_path='../data/v1-2a.json'):
        data = get_data(docs_path, json_data_path)
        train_set = {}
        for tid in data:
            train_set[tid] = []
            # citation number
            for cit in data[tid]:
                offsets = []
                ref_art = ''
                for ann in data[tid][cit].values():
                    for off in ann['ref_offset']:
                        offsets.append(off)
                    query = ann['cit_text']
                    ref_art = ann['ref_art']
                # union of all annotators' reference offsets
                offsets = union(offsets)
                doc_type = tid.lower() + '_' + ref_art.lower()[:-4]
                d = self._prep_data(clean(query), doc_type, offsets)
                train_set[tid].append(d)
        return train_set

    def _prep_data(self, query, doc_type, relevant_offsets, save_path=False):
        '''
        Prepares the training data for learning to rank.
        Fetches the sentences of the document from ElasticSearch and
            returns them as (query, sentence, label) training tuples.

        Args:
            query(str): The query that is used to retrieve relevant offsets
            doc_type(str): Name of the type on the elasticsearch index,
                e.g. 'd1409_train_sherr'
            relevant_offsets(list): list of offsets that are relevant

        Returns:
            list of tuples: a list of training data
            ('query', 'some text', label) where label is 1 if relevant, 0 otherwise
        '''
        hits = self.es_int.find_all(doc_type=doc_type)
        x_train = []
        y_train = []
        queries = []
        for hit in hits:
            label = 0
            offset = eval(hit['_source']['offset'])
            for off in relevant_offsets:
                if self.get_overlap(offset, off) > 0:
                    label = 1
                    break
            x_train.append(hit['_source']['sentence'])
            y_train.append(label)
            queries.append(query)


#         if save_path:
#             with codecs.open(save_path, 'wb', 'utf-8') as mf:
#                 pickle.dump(zip(x_train, y_train), mf)
        return zip(queries, x_train, y_train)

    def get_overlap(self, a, b):
        return max(0, min(a[1], b[1]) - max(a[0], b[0]))
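A self-contained sketch of the interval-overlap test that _prep_data uses to decide whether a retrieved sentence is relevant; the function name is illustrative, the logic is the same as get_overlap above.

def get_overlap_sketch(a, b):
    # Length of the intersection of two (start, end) character spans,
    # floored at zero when the spans are disjoint.
    return max(0, min(a[1], b[1]) - max(a[0], b[0]))

# get_overlap_sketch((120, 180), (150, 300)) -> 30   (overlap, labelled 1)
# get_overlap_sketch((0, 100), (150, 300))   -> 0    (no overlap, labelled 0)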
Exemple #26
0
class Method(MethodInterface):
    """ Produce reference text by submitting the
        citance to the ElasticSearch server.
    """
    method_opts = {
        'maxsize': {
            'type': int,
            'default': 100
        },
        'stopwords-path': {
            'default': STOPWORDS_PATH
        },
        'remove-stopwords': {
            'default': False,
            'action': 'store_true'
        },
        'remove-stopwords-phrase': {
            'default': False,
            'action': 'store_true'
        },
        'noun-phrase': {
            'default': False,
            'action': 'store_true'
        },
        'phrase-slop': {
            'type': int,
            'default': 0
        },
        'combine': {
            'default': False,
            'action': 'store_true'
        },
        'docs-path': {
            'default': DOCS_PATH
        },
        'expand-window': {
            'default': False,
            'action': 'store_true'
        },
        'query-terms': {
            'default': False,
            'action': 'store_true'
        },
        'verbose': {
            'default': False,
            'action': 'store_true'
        },
        'qterm-weight': {
            'type': float,
            'default': 1.0
        },
        'phrase-weight': {
            'type': float,
            'default': 2.0
        },
        'surrounding-words-weight': {
            'type': float,
            'default': 1.0
        },
        'filter-allstops': {
            'default': False,
            'action': 'store_true'
        },
        'expand-results': {
            'type': int,
            'default': 0
        },
        'sentence': {
            'default': False,
            'type': int
        },
        'analyzer': {
            'default': False,
            'type': str
        }
    }

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
        self.ann_client = AnnotationsClient()

    def run(self, test_data):
        out_results = []
        #         outfile = codecs.open('tmp/nlp.txt' , 'wb' , 'UTF-8')
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')

            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))

            # preprocess (removes citations) and tokenizes
            # citation text before submitting to elasticsearch
            q = self.regex_citation('', ann['citation_text'])
            q = re.sub(r'( ,)+', ',', q)
            q = q.encode('ascii', 'ignore')
            nlp_extractor = Extract_NLP_Tags()
            nps = nlp_extractor.extract_NP(q, mode='flattened')
            #             outfile.write('query: "%s" \nparsed: "%s"\n\n' %(q,str(nps)) )
            q1 = ''
            queryterms = set()
            for e in nps:
                for e1 in e:
                    if len(e1) < 4:
                        all_stop = False
                        if self.opts.remove_stopwords_phrase:
                            tmp = ' '.join(
                                sub_e.replace('"', '') for sub_e in e1 if
                                sub_e.replace('"', '') not in self.stopwords)
                        else:
                            count = 0
                            for sub_e in e1:
                                if sub_e.replace('"', '') in self.stopwords:
                                    count += 1
                            if count == len(e1):
                                all_stop = True
                            tmp = ' '.join(
                                sub_e.replace('"', '') for sub_e in e1)
                        if tmp not in queryterms and not all_stop:
                            q1 += '"' + tmp + '"^' + \
                                str(self.opts.phrase_weight) + ' '
                            queryterms.add(tmp)
            if self.opts.expand_window:
                window = self.doc_mod.get_para(
                    ann['topic_id'].lower(),
                    ann['citing_article'][:-4].lower(),
                    (ann['citation_offset'][0], ann['citation_offset'][1]))
                sorrounding_text = deepcopy(window['sentence'])
                st = self.regex_citation('', sorrounding_text)
                st = re.sub(r'( ,)+', ',', st)
                st = st.encode('ascii', 'ignore')
                other_nouns = nlp_extractor.extract_NP(st, mode='flattened')
                for e in other_nouns:
                    for e1 in e:
                        if len(e1) < 4:
                            all_stop = False
                            if self.opts.remove_stopwords_phrase:
                                tmp = ' '.join(
                                    sub_e.replace('"', '') for sub_e in e1
                                    if sub_e.replace('"', '') not in
                                    self.stopwords)
                            else:
                                count = 0
                                for sub_e in e1:
                                    if sub_e.replace('"',
                                                     '') in self.stopwords:
                                        count += 1
                                if count == len(e1):
                                    all_stop = True
                                tmp = ' '.join(
                                    sub_e.replace('"', '') for sub_e in e1)
                            if tmp not in queryterms and not all_stop:
                                q1 += '"' + tmp + '"^' + \
                                    str(self.opts.surrounding_words_weight) + \
                                    ' '
                                queryterms.add(tmp)
            if self.opts.query_terms:
                q = ' '.join([
                    t + '^' + str(self.opts.qterm_weight)
                    for t in self.es_int.tokenize(q)
                    if (t not in self.stopwords and t not in authors
                        and not (self.all_digits(t)))
                ])
                q1 = q1 + ' ' + q
            if self.opts.verbose:
                print "query:   %s" % q
                print "q1   :       %s" % q1
                print '_____'
#             q2 = self.es_int.tokenize(q1, 'sentence')
#             q2 = ' '.join([t for t in self.es_int.tokenize(q1)
#                           if (t not in self.stopwords and
#                               t not in authors and
#                               not(self.all_digits(t)))])
            if self.opts.analyzer:
                r = self.es_int.simple_search(
                    q1.strip(),
                    maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    phrase_slop=self.opts.phrase_slop,
                    params={'analyzer': self.opts.analyzer})
            else:
                r = self.es_int.simple_search(
                    q1.strip(),
                    maxsize=self.opts.maxsize,
                    source_fields=['offset', 'sentence'],
                    # field='sentence',
                    doc_type=doc_type,
                    phrase_slop=self.opts.phrase_slop)
            if self.opts.sentence:
                for idx, e in enumerate(deepcopy(r)):
                    if '_id' in e:
                        query = ' OR '.join([
                            '_id:%s' % (str(int(e['_id']) + j).zfill(5))
                            for j in range(-1 * self.opts.sentence,
                                           self.opts.sentence + 1)
                            if j != 0 and int(e['_id']) + j > 0
                        ])
                        sour = self.es_int.simple_search(
                            query,
                            doc_type=e['_type'],
                            maxsize=2 * self.opts.sentence,
                            source_fields=['offset', 'sentence'])
                        #                         aft = self.es_int.get_page(
                        #                             str(int(e['_id']) + 1).zfill(5), e['_type'])
                        #                         bef = self.es_int.get_page(
                        #                             str(int(e['_id']) + 1).zfill(5), e['_type'])
                        if len(sour) > 0:
                            for s in sour:
                                r.insert(idx + 1, s)

            for e in r:
                fld = e.pop('fields')
                if eval(fld['offset'][0])[0] < self.opts.expand_results:
                    beg = 0
                else:
                    beg = eval(fld['offset'][0])[0] - self.opts.expand_results
                endd = eval(fld['offset'][0])[1] + self.opts.expand_results
                e['offset'] = [(beg, endd)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q1

            r1 = deepcopy(r)
            r = []
            for idx, e in enumerate(r1):
                if idx < self.opts.maxsize:
                    r.append(e)

            if self.opts.combine:
                if len(r) == 0:
                    r = [{
                        '_type': doc_type,
                        '_index': self.opts.index_name,
                        '_score': 0,
                        'sentence': '',
                        'offset': [(0, 1)],
                        'query': q1,
                        '_id': -11
                    }]
                r = [{
                    '_type': r[0]['_type'],
                    '_index': r[0]['_index'],
                    'query': q1,
                    '_score': sum([e['_score'] for e in r]),
                    'offset': [e['offset'][0] for e in r],
                    'sentence': [e['sentence'] for e in r],
                    '_id': '-000001'
                }]
            out_results.append(r)
        return out_results
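A standalone sketch, with an illustrative helper name, of the boosted phrase-query string assembled in the loop above; the original builds it inline, skips all-stopword phrases, and also mixes in phrases from the surrounding window when expand-window is set.

def build_phrase_query_sketch(noun_phrases, phrase_weight=2.0):
    # Wrap each distinct noun phrase in quotes and attach a caret boost,
    # producing the query-string form submitted to ElasticSearch.
    seen = set()
    parts = []
    for np in noun_phrases:
        if np and np not in seen:
            parts.append('"%s"^%s' % (np, phrase_weight))
            seen.add(np)
    return ' '.join(parts)

# build_phrase_query_sketch(['pancreatic cancer', 'cell line'])
# -> '"pancreatic cancer"^2.0 "cell line"^2.0'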
Exemple #27
0
class Prep(object):

    def __init__(self, index='biosum'):
        self.es_int = ESInterface(index_name=index)

    def prep(self,
             docs_path='../data/TAC_2014_BiomedSumm_Training_Data',
             json_data_path='../data/v1-2a.json'):
        data = get_data(docs_path, json_data_path)
        train_set = {}
        for tid in data:
            train_set[tid] = []
            # citation number
            for cit in data[tid]:
                offsets = []
                ref_art = ''
                for ann in data[tid][cit].values():
                    for off in ann['ref_offset']:
                        offsets.append(off)
                    query = ann['cit_text']
                    ref_art = ann['ref_art']
                # union of all annotators' reference offsets
                offsets = union(offsets)
                doc_type = tid.lower() + '_' + ref_art.lower()[:-4]
                d = self._prep_data(clean(query), doc_type, offsets)
                train_set[tid].append(d)
        return train_set

    def _prep_data(self, query, doc_type, relevant_offsets, save_path=False):
        '''
        Prepares the training data for learning to rank.
        Fetches the sentences of the document from ElasticSearch and
            returns them as (query, sentence, label) training tuples.

        Args:
            query(str): The query that is used to retrieve relevant offsets
            doc_type(str): Name of the type on the elasticsearch index,
                e.g. 'd1409_train_sherr'
            relevant_offsets(list): list of offsets that are relevant

        Returns:
            list of tuples: a list of training data
            ('query', 'some text', label) where label is 1 if relevant, 0 otherwise
        '''
        hits = self.es_int.find_all(doc_type=doc_type)
        x_train = []
        y_train = []
        queries = []
        for hit in hits:
            label = 0
            offset = eval(hit['_source']['offset'])
            for off in relevant_offsets:
                if self.get_overlap(offset, off) > 0:
                    label = 1
                    break
            x_train.append(hit['_source']['sentence'])
            y_train.append(label)
            queries.append(query)
#         if save_path:
#             with codecs.open(save_path, 'wb', 'utf-8') as mf:
#                 pickle.dump(zip(x_train, y_train), mf)
        return zip(queries, x_train, y_train)

    def get_overlap(self, a, b):
        return max(0, min(a[1], b[1]) - max(a[0], b[0]))
Exemple #28
0
class InsertionRankHelper(object):
    """docstring for InsertionRankHelper"""
    def __init__(self,
                 filter_list=None,
                 base_cache=None,
                 cachedir='cache',
                 eshost=None,
                 esport=None,
                 esindex=None,
                 sim_func=CosineSimilarity(),
                 stopwords=None,
                 weighted=False,
                 query_terms_only=False):

        if not eshost:
            eshost = 'localhost'
        if not esport:
            esport = 9200
        if not esindex:
            esindex = 'pubmed'

        self.es = ESInterface(host=eshost, port=esport, index_name=esindex)
        self.cachedir = cachedir
        self.sim_func = sim_func
        self.timer = Timer(prefix='[timer]')
        self.weighted = weighted
        self.query_terms_only = query_terms_only
        self.base_cache = base_cache

        if not stopwords:
            stopwords = set()
        self._stopwords = stopwords

        if filter_list:
            filter_list = set([e for e in filter_list if e not in stopwords])
        self._filter_list = filter_list

        # calculate fingerprint to use as the cache comment
        finger_text = ' '.join([
            w for w in set.union((self._filter_list or set()), self._stopwords)
        ])
        finger_md5 = md5()
        finger_md5.update(finger_text.encode('utf-8'))
        self.finger_filter = finger_md5.hexdigest()

    def _in_filter_list(self, elem):
        if elem in self._stopwords:
            return False
        try:
            return (elem in self._filter_list)
        except TypeError:
            return True

    @simple_caching()
    def _get_docs_content_insrank(self, results):
        for res in results:
            content = self.es.get_page(res['id'], res['type'])
            content.pop('references', None)
            res['content'] = [e[1] for e in content.items()]

        return results

    @simple_caching()
    def _tokenize_and_expand(self, qid, question, results):
        """ Takes care of tokenizing the question/results and
            expanding them using one of the four dictionary
            expansion methods.
        """
        cache_comment = self.base_cache + qid
        results = self._get_docs_content_insrank(results,
                                                 cache_comment=cache_comment)
        docs = {
            r['id']: unicode(r['content']).replace(':', ' ')
            for r in results
        }
        docs[qid] = question

        # filters out terms
        docs = {
            did: ' '.join([w for w in doc.split() if self._in_filter_list(w)])
            for did, doc in docs.items()
        }
        return self.exp_method(docs).objout

    def get_docs(self, qid, question, results):

        cache_comment = (self.base_cache + '{0}_{1}_{2}'.format(
            qid, self.exp_method.__name__, self.finger_filter))

        docs = self._tokenize_and_expand(qid,
                                         question,
                                         results,
                                         cache_comment=cache_comment)

        question = Document(qid, 0,
                            docs.pop(qid).split(), float('inf'), 'query',
                            float('inf'), 0)

        doc_results = []
        for res in results:
            res['tokens'] = docs[res['id']].split()

            # eliminates tokens that are not part of the
            # question if specified by argument query_terms_only
            if self.query_terms_only:
                res['tokens'] = [
                    t for t in res['tokens'] if t in question.terms
                ]

            doc_results.append(
                Document(res['id'],
                         res['rank'],
                         res['tokens'],
                         res['relevance'],
                         res['type'],
                         res['score'],
                         res['rank'],
                         weighted=self.weighted))
        return question, doc_results

    def _swap_position(self, pos_list, posA, posB):
        elA = pos_list[posA]
        elB = pos_list[posB]

        elA.rank = posB + 1
        elB.rank = posA + 1

        pos_list[posB] = elA
        pos_list[posA] = elB

        return True

    def _is_swappable(self, doc, new_rank):
        shift = int(fabs(doc.original_rank - new_rank))
        if shift > self.max_rerank_pos:
            return False
        else:
            return True

    def rerank(self,
               qid,
               question,
               results,
               exp_method,
               max_rerank_pos=None,
               training_mode=False):
        """ Performs reranking """

        # Dynamically retrieves the classes available in expansion_methods
        # using the inspect module.
        methods = inspect.getmembers(expansion_methods, inspect.isclass)
        methods = [str(e[1]).split('.') for e in methods]
        methods = [e[len(e) - 1] for e in methods]

        # tries to load the requested method from expansion_methods; if it
        # fails, it terminates with status 1
        try:
            self.exp_method = getattr(expansion_methods, exp_method)
        except AttributeError:
            print >> sys.stderr, ('[error] {m} is not a valid method: ' +
                                  'use {l}.').format(m=exp_method,
                                                     l=', '.join(methods))
            sys.exit(1)

        # if no maximum number of shifts is set, it lets
        # results move up/down as much as they want
        if not (max_rerank_pos):
            max_rerank_pos = len(results)
        self.max_rerank_pos = max_rerank_pos

        # true if at least one pair of elements has been swapped,
        # or if this is before the first iteration
        swap_flag = True

        self.timer('expansion query {qid}'.format(qid=qid), quiet=True)
        question, docs = self.get_docs(qid, question, results)
        self.timer('expansion query {qid}'.format(qid=qid),
                   quiet=training_mode)

        self.timer('reranking {qid}'.format(qid=qid), quiet=True)
        while swap_flag:
            swap_flag = False
            for (i, j) in [(i, i + 1) for i in range(len(docs) - 1)]:
                sim_i = self.sim_func(question, docs[i])
                sim_j = self.sim_func(question, docs[j])

                if (sim_j > sim_i and self._is_swappable(docs[i], j + 1)
                        and self._is_swappable(docs[j], i + 1)):
                    self._swap_position(docs, i, j)
                    swap_flag = True

        self.timer('reranking {qid}'.format(qid=qid), quiet=training_mode)

        if not training_mode:
            # calculate and print statistics on # of shifts
            rankvals = np.array([fabs(d.original_rank - d.rank) for d in docs])
            msg = '[info] shift avg: {:.2f}\t shift stdev: {:.2f}'
            print msg.format(rankvals.mean(), rankvals.std())

        out = [{
            'id': d.id,
            'score': d.score,
            'rank': d.rank,
            'relevance': d.relevance,
            'original_rank': d.original_rank
        } for d in sorted(docs, key=lambda o: o.rank)]

        return out
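A standalone sketch of the adjacent-swap reranking that the while loop above performs, reduced to a list of precomputed query-document similarities; the helper name is illustrative, and the max-shift constraint and the dictionary expansion step are omitted.

def rerank_by_similarity_sketch(sims):
    # sims[i] is the similarity of the document currently at rank i + 1.
    # Adjacent documents are swapped whenever the lower-ranked one is more
    # similar, until no further swaps occur.
    order = list(range(len(sims)))
    swapped = True
    while swapped:
        swapped = False
        for i in range(len(order) - 1):
            if sims[order[i + 1]] > sims[order[i]]:
                order[i], order[i + 1] = order[i + 1], order[i]
                swapped = True
    return order  # indices of the original results, most similar first

# rerank_by_similarity_sketch([0.2, 0.9, 0.5]) -> [1, 2, 0]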
Exemple #29
0
class Method(MethodInterface):

    """ Produce reference text by submitting the
        citance to the ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'thresh': {'type': int, 'default': False},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': True,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'cache-path': {'default': 'cache'},
                   'idf_index': {'default': 'pubmed'}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(r"(\(\s?(([A-Za-z]+\.?\s?)+,? \d+"
                                         r"(\s?([;,]|and)\s)?)+\))|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])

    def run(self, test_data):
        out_results = []
        doc_freq_path = os.path.join(self.opts.cache_path, 'idfidx' +
                                     self.opts.idf_index +
                                     'wp_doc_freq.json')
        if os.path.exists(doc_freq_path):
            with codecs.open(doc_freq_path,
                             'rb',
                             'UTF-8') as mf:
                doc_freq = json.load(mf)
        else:
            doc_freq = {}
        es_int2 = ESAuth(host='devram4.cs.georgetown.edu',
                         index_name=self.opts.idf_index)
        count_docs = es_int2.count(query='*:*')
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')

            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))

            # preprocess (removes citations) and tokenizes
            # citation text before submitting to elasticsearch
            q = self.regex_citation('', ann['citation_text'])
            q = q.encode('ascii', 'ignore')
            terms = []
            for t in self.es_int.tokenize(q, 'sentence'):
                if (t not in self.stopwords and
                        t not in authors and
                        not(self.all_digits(t))):
                    if t not in doc_freq.keys():
                        count = es_int2.count(t)
                        if count > 0:
                            idf = log(count_docs / float(count + 1))
                            doc_freq[t] = idf
                            terms.append(t)
                    else:
                        idf = doc_freq[t]
                        terms.append(t)
            avg_idf = np.average([doc_freq[t] for t in terms])
            thresh = self.opts.thresh if self.opts.thresh is not False\
                else avg_idf
            q = ' '.join([t for t in terms
                          if (doc_freq[t] > thresh)])
            if q == '':
                max_idf = -1
                for t in terms:
                    if max_idf < doc_freq[t]:
                        max_idf = doc_freq[t]
                        q = t
            r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                          source_fields=['offset', 'sentence'],
                                          field='sentence',
                                          doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
#                 beg = e['offset'][0][0] - \
#                     100 if e['offset'][0][0] else e['offset'][0][0]
#                 end = e['offset'][0][1] + 100
#                 e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q

            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query':q, '_id':-11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
        with codecs.open(doc_freq_path,
                         'wb',
                         'UTF-8') as mf:
            json.dump(doc_freq, mf, indent=2)
        return out_results
Exemple #30
0
class Method(MethodInterface):

    """ Produce reference text by submitting the
        citance to the ElasticSearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 100},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'analyzer': {'default': False, 'type': str},
                   'ngram': {'default': False, 'type': int},
                   'concept_boost': {'default': 3, 'type': int},
                   'np_boost': {'default': 3, 'type': int},
                   'sent_boost': {'default': 1, 'type': int},
                   'stem_boost': {'default': 1, 'type': int},
                   'runmode': {'default': 'train'}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)

        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with file(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host=constants.mysql_server,
                                  port=constants.mysql_port,
                                  user=constants.mysql_user,
                                  passwd=constants.mysql_pass,
                                  db=constants.mysql_db)
        self.cur = self.db.cursor()
        self.ttys = ['SY']

        ttygroups = {"syns": ('AUN', 'EQ', 'SYN', 'MTH'),
                     "chemicals": ('CCN', 'CSN'),
                     "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
                     "diseases": ('DI', ), "findings": ('FI', ),
                     "hierarchy": ('HS', 'HT', 'HX'), "related": ('RT', ),
                     "preferred": ('PTN', 'PT')}
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
#         self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # [Chen et al.2000]
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)
        self.lmtzr = WordNetLemmatizer()
        self.stemmer = stem.porter.PorterStemmer()

#         if len(args) > 3:
#             self.ttys = []
#
#             for tty in args[3:]:
#                 if tty in ttygroups:
#                     self.ttys.extend(ttygroups[tty])
#                 else:
#                     self.ttys.append(tty)

    def expand_concept(self, cdata, synonyms=False):
        rejected_semTypes = {'ftcn', 'qlco', 'qnco', 'inpr'}
        Okay = True
        for st in cdata['SemanticTypes']:
            if st in rejected_semTypes:
                Okay = False
        if Okay:
            if synonyms:
                return self.concept_synonyms(cdata['ConceptId'])
            else:
                return cdata['ConceptId']

    def concept_synonyms(self, cui):
        if cui in evaluate.cachefile:
            return set(evaluate.cachefile[cui])
        else:
            termtypes = ("and (TTY=" +
                         " OR TTY=".join(["'%s'" % x for x in self.ttys]) + ")")
    #         query = 'select * from (select distinct STR from MRCONSO a,'+\
    #                 '(select distinct CUI1,AUI1,AUI2,RELA,CUI2 from MRREL where cui1 = \'%s\'' % cui +\
    #                 ' and rela is not null) b where a.CUI=b.CUI2 and a.LAT=\'ENG\') dd  ;'
            query = "select STR from MRCONSO where " +\
                "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +\
                termtypes + " and (SAB = 'SNOMEDCT_US')"
#             print query
            self.cur.execute(query)

#         self.cur.execute("select STR from MRCONSO where " +
#                          "CUI = '%s' and LAT = 'ENG' and ISPREF = 'Y'" % cui +
#                          termtypes + " and SAB != 'CHV'")

            syns = set(filter(lambda y: y.replace(" ", "").isalpha(),
                              [x.lower() for x, in self.cur.fetchall()]))
            evaluate.cachefile[cui] = list(syns)
            return syns

    def run(self, test_data):
        out_results = []
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            doc_type = doc_type.replace(',', '').replace("'", '"')
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            if self.opts.runmode == 'eval':
                doc_type = doc_type.replace('train', 'eval')

            doc = self.doc_mod.get_doc(
                ann['topic_id'].lower(), ann['citing_article'])
            cit_text = ann['citation_text']
            cit_text_doc = doc[
                ann['citation_offset'][0]:ann['citation_offset'][1]]
            cit_marker = ann['citation_marker']
            cit_marker_doc = doc[
                ann['citation_marker_offset'][0]:ann['citation_marker_offset'][1]]
            cit_mrk_offset_sent = [ann['citation_marker_offset'][0] - ann['citation_offset'][0],
                                   ann['citation_marker_offset'][1] - ann['citation_offset'][0]]
            cleaned = self.reg_apa.sub('', cit_text_doc)
            cleaned = self.reg_ieee.sub('', cleaned)
            cleaned = self.reg_paranthesis.sub('', cleaned)
            cleaned = self.reg_apa_rare.sub('', cleaned)
            cleaned = re.sub('\s+', ' ', cleaned).strip()
            cleaned = re.sub('(,\s)+', ', ', cleaned).strip(', ')

            '''
            -------------- IMMEDIATE NP BEFORE MARKER ----------
            '''
            m = list(self.reg_apa.finditer(cit_text_doc))
            m1 = list(self.reg_ieee.finditer(cit_text_doc))
            m2 = list(self.reg_paranthesis.finditer(cit_text_doc))
            # (start, end, group)
            if len(m) > 0:
                markers = [(e.start(), e.end(), e.group(0)) for e in m]
            elif len(m1) > 0:
                markers = [(e.start(), e.end(), e.group(0))
                           for e in m1]
            elif len(m2) > 0:
                markers = [(e.start(), e.end(), e.group(0))
                           for e in m2]
            else:
                m3 = list(self.reg_apa_rare.finditer(cit_text_doc))
                if len(m3) > 0:
                    markers = [(e.start(), e.end(), e.group(0))
                               for e in m3]
                else:
                    markers = []

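            # NOTE: with a threshold of 10000 this branch is effectively
            # disabled, so the nearest-noun-phrase heuristic below never runs
            # and the query is always built in the else branch.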
            if len(markers) > 10000:

                nps = self.nlp_extractor.parse_by_mbsp(cleaned.strip())
                if nps is None:
                    q = cleaned
                else:
                    t = nps.split(' ')
                    concepts = []
                    for i in range(len(t)):
                        conc = []
                        toks = t[i].split('/')
                        while(('NP' in toks[2]) and (i < len(t))):
                            conc.append((toks[0], toks[6]))
                            i += 1
                            if i < len(t):
                                toks = t[i].split('/')
                        if len(conc) > 0:
                            concepts.append(conc)
                    noun_phrases = [
                        ' '.join([s1[0] for s1 in t1]) for t1 in concepts]

    #                 nps = self.nlp_extractor.extract_NP(cleaned, mode='flattened')
    #                 nps = [[[a[1:-1] for a in piece] for piece in sent] for sent in nps]
        #             nps = [a[1:-1] for sent in nps for piece in sent for a in piece]
    #                 for e in nps:
    #                     noun_phrases = [(sub_e[0].replace('"', ''),idx) for idx, sub_e in enumerate(e) if sub_e[0].replace('"', '') not in self.stopwords]
                    tokens = self.tokenizer.tokenize(cit_text)
                    tokens_offsets = self.tokenizer.span_tokenize(cit_text_doc)
                    nearest = ''
                    nearest_idx = -1
                    distance = 100000
                    # find nearest word to the citation marker
                    for idx, f in enumerate(tokens_offsets):
                        # check to see if in valid span (not citation markers)
                        invalid = False
                        for e in markers:
                            if f[0] >= e[0] and f[1] <= e[1]:
                                invalid = True
                        if (cit_mrk_offset_sent[0] - f[1] >= 0) and\
                                (cit_mrk_offset_sent[0] - f[1] < distance) and\
                                not invalid:
                            distance = cit_mrk_offset_sent[0] - f[1]
                            if len(re.findall(r"^[^A-Za-z]+$", tokens[idx])) == 0:
                                nearest = tokens[idx]
                                if (idx > 0) and len(re.findall(r"^[^A-Za-z]+$", tokens[idx - 1])) == 0:
                                    nearest = tokens[
                                        idx - 1] + ' ' + tokens[idx]
                                nearest_idx = idx
                        elif (cit_mrk_offset_sent[0] < f[1]):
                            break
                        if len(nearest.split(' ')) == 1 and nearest_idx > 0 and\
                                tokens[nearest_idx] not in stops100:
                            nearest = tokens[idx - 1] + ' ' + tokens[idx]
                    largest = 0
                    q = ''
                    # use the longest noun phrase that contains the nearest word
                    for n in noun_phrases:
                        if (nearest in n) and (len(n.split()) > largest):
                            q = '"%s"' % n
                            largest = len(n.split())
                    if q == '':
                        q = cleaned
                q = sanitize(q)
# find longest noun phrase containing the nearest
#                 res = None
#                 for np in nps[0]:
#                    if nearest in np and len(np) > longest and len(np) < 5:
#                        longest = len(np)
#                        res = np
#                 if res is not None:
#                     res = ' '.join([el for el in res])
#                 else:
#                     res = nearest
            else:
                try:
                    qtxt = unicodedata.normalize('NFKD',
                                                 cleaned).encode('ascii', 'ignore')
                except Exception:
                    # normalization failed (e.g. `cleaned` is already a byte
                    # string); fall back to a plain ASCII re-encode
                    qtxt = cleaned.encode('ascii', 'ignore')
                qterms = [qtxt]
                tokens = self.tokenizer.tokenize(' '.join(qterms))
    #             tokens = self.es_int.tokenize(qtxt, analyzer=self.analyzer)
                q = ' '.join([t for t in tokens
                              if (t not in self.stopwords and
                                  not(self.all_digits(t)))])
                if self.opts.concept_boost > 0:

                    qconcepts = mmrun(cleaned)
                    qcids = []
                    for cdata in qconcepts['concepts']:
                        newterms = self.expand_concept(cdata)
                        if newterms is not None:
                            qcids.append(newterms)
                else:
                    qcids = []
                if self.opts.np_boost > 0:
                    nps = self.nlp_extractor.extract_NP(qtxt, mode='flattened')
                    noun_phs = set()
                    for e in nps:
                        for e1 in e:
                            if len(e1) < 4:
                                all_stop = False
                                if self.opts.remove_stopwords:
                                    tmp = ' '.join(sub_e.replace('"', '')
                                                   for sub_e in e1
                                                   if sub_e.replace('"', '') not in self.stopwords)
                                    # skip phrases that are nothing but stopwords
                                    if not tmp.strip():
                                        all_stop = True
                                else:
                                    count = 0
                                    for sub_e in e1:
                                        if sub_e.replace('"', '') in self.stopwords:
                                            count += 1
                                    if count == len(e1):
                                        all_stop = True
                                    tmp = ' '.join(sub_e.replace('"', '')
                                                   for sub_e in e1)
                                if '"' + tmp.replace('"', '') + '"' not in noun_phs and not all_stop:
                                    noun_phs.add(
                                        '"' + tmp.replace('"', '') + '"')
                else:
                    noun_phs = []

            if self.opts.analyzer:
                r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                              source_fields=[
                                                  'offset', 'sentence'],
                                              # field='sentence',
                                              doc_type=doc_type,
                                              params={'analyzer': self.opts.analyzer})
            else:
                #                 r = self.es_int.multi_field_search(sentence=q,
                #                                                    concepts=' '.join(
                #                                                        [w for w in qcids]),
                #                                                    noun_phrases=' '.join(
                #                                                        [e for e in noun_phs]),
                #                                                    maxsize=self.opts.maxsize,
                #                                                    source_fields=[
                #                                                        'offset', 'sentence', 'mm-concepts', 'noun_phrases'],
                #                                                    doc_type=doc_type,
                #                                                    field_boost=[self.opts.sent_boost,
                #                                                                 self.opts.concept_boost,
                # self.opts.np_boost])
                fields = [
                    'sentence', 'mm-concepts', 'noun_phrases_1', 'stemmed']
                tokens1 = []
                for w in self.tokenizer.tokenize(cleaned):
                    keep = True
                    if self.opts.remove_stopwords and w in self.stopwords:
                        keep = False
                    if '-' in w:
                        # also add the dehyphenated stem as a query term
                        tokens1.append(self.stemmer.stem(w.replace('-', '')))
                    if keep:
                        tokens1.append(self.stemmer.stem(w))
                field_vals = [q, ' '.join([w for w in qcids]),
                              (' '.join([e for e in noun_phs])).replace(
                                  '"', ''),
                              ' '.join([w for w in tokens1])]
                field_boosts = [
                    self.opts.sent_boost, self.opts.concept_boost, self.opts.np_boost, self.opts.stem_boost]
                r = self.es_int.multi_field_search(field_vals=field_vals,
                                                   fields=fields,
                                                   source_fields=[
                                                       'offset', 'sentence'],
                                                   maxsize=self.opts.maxsize,
                                                   field_boost=field_boosts,
                                                   doc_type=doc_type)
#             r = self.es_int.find_all(doc_type=doc_type, source_fields=['offset','sentence'])
            for e in r:
                fld = e.pop('fields')
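                # the stored offset is a stringified tuple; ast.literal_eval
                # would be a safer way to parse it than eval()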
                e['offset'] = [eval(fld['offset'][0])]
#                 beg = e['offset'][0][0] - \
#                     100 if e['offset'][0][0] else e['offset'][0][0]
#                 end = e['offset'][0][1] + 100
#                 e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'score': 0,
                          'sentence': [''],
                          'offset': [(0, 1)],
                          'query':q, '_id':-11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
                
                
            out_results.append(r)
        return out_results
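
The branch behind the marker threshold above reduces a citance to the word nearest its citation marker and then prefers the longest noun phrase containing that word as the query. The helper below is a minimal, self-contained sketch of that idea; it assumes a plain `re` tokenizer and a simplified marker pattern instead of the NLTK tokenizer, MBSP chunker and full APA/IEEE regexes used in the method, and the function name and MARKER_RE pattern are illustrative only.

import re

# simplified citation-marker pattern: numeric brackets like "[3]" or a
# single "(Author et al., 2004)"-style reference
MARKER_RE = re.compile(r"\[[\d,\-\s]+\]|\([A-Z][A-Za-z\-]+(?: et al\.)?,? \d{4}\)")


def nearest_content_word(citance):
    """Return the alphabetic token closest to (and preceding) the first
    citation marker in `citance`, or None if no marker is found."""
    marker = MARKER_RE.search(citance)
    if marker is None:
        return None
    nearest = None
    for tok in re.finditer(r"[A-Za-z][\w\-']*", citance):
        if marker.start() <= tok.start() and tok.end() <= marker.end():
            continue                   # token sits inside the marker itself
        if tok.end() <= marker.start():
            nearest = tok.group(0)     # last token seen before the marker
        else:
            break
    return nearest

# nearest_content_word("This extends the parser of Collins [3].") -> 'Collins'
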
class Method(MethodInterface):

    """ Produce reference text by submitting the
        citance to the Elasticsearch server.
    """
    method_opts = {'maxsize': {'type': int, 'default': 3},
                   'stopwords-path': {'default': STOPWORDS_PATH},
                   'remove-stopwords': {'default': False,
                                        'action': 'store_true'},
                   'combine': {'default': False, 'action': 'store_true'},
                   'analyzer': {'default': False, 'type': str},
                   'ngram': {'default': False, 'type': int}}

    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)

        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.stopwords_path:
            stop_path = self.opts.stopwords_path
        else:
            stop_path = STOPWORDS_PATH
        if self.opts.remove_stopwords:
            with file(stop_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.tokenizer = RegexpTokenizer('[^\w\-\']+', gaps=True)

    def run(self, test_data):
        #         with codecs.open('tmp/test_data.json', 'wb', 'utf-8') as mf:
        #             json.dump(test_data, mf, indent=2)
        out_results = []
        det_res = {}
        for ann in test_data:
            doc_type = '_'.join((ann['topic_id'].lower(),
                                 ann['reference_article'][:-4].lower()))
            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            doc_type = doc_type.replace('train', 'eval')
            doc_type = doc_type.replace(',', '').replace("'", '"')

            # TEMPORARY FIX FOR WRONG DOCUMENT TYPE NAME
            doc_type = doc_type.replace('eval', 'train')
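            # net effect of the two replace() calls above: 'eval' in the doc
            # type becomes 'train', while 'train' itself is left unchanged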

            authors = set((ann['reference_article'][:-4].lower().strip(),
                           ann['citing_article'][:-4].lower().strip()))

            # preprocess (removes citations) and tokenizes
            # citation text before submitting to elasticsearch
            q = self.regex_citation('', ann['citation_text'])
            q = q.encode('ascii', 'ignore')
#             tokens = self.es_int.tokenize(q, "sentence")
            tokens = self.tokenizer.tokenize(q)
            tokens = ['"' + t + '"' if '-' in t else t for t in tokens]
            q = ' '.join([t for t in tokens
                          if (t not in self.stopwords and
                              t not in authors and
                              not(self.all_digits(t)))])

            if self.opts.ngram:
                tokens = self.es_int.tokenize(q, "sentence")
                new_query = ''
                # slide a window of `ngram` tokens across the whole sentence
                for i in range(len(tokens) - self.opts.ngram + 1):
                    tmp = ' '.join(tokens[i:i + self.opts.ngram])
                    new_query += '"' + tmp + '" '
                q = new_query.strip()
#             q = '*:*'
            if self.opts.analyzer:
                r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                              source_fields=[
                                                  'offset', 'sentence'],
                                              # field='sentence',
                                              doc_type=doc_type,
                                              params={'analyzer': self.opts.analyzer})
            else:
                r = self.es_int.simple_search(q, maxsize=self.opts.maxsize,
                                              source_fields=[
                                                  'offset', 'sentence'],
                                              # field='sentence',
                                              doc_type=doc_type)
            for e in r:
                fld = e.pop('fields')
                e['offset'] = [eval(fld['offset'][0])]
#                 beg = e['offset'][0][0] - \
#                     100 if e['offset'][0][0] else e['offset'][0][0]
#                 end = e['offset'][0][1] + 100
#                 e['offset'] = [(beg, end)]
                e['sentence'] = fld['sentence'][0]
                e['query'] = q
                e['topic'] = ann['topic_id'].lower()

            if self.opts.combine:
                if len(r) == 0:
                    r = [{'_type': doc_type,
                          '_index': self.opts.index_name,
                          '_score': 0,
                          'sentence': '',
                          'offset': [(0, 1)],
                          'query':q, '_id':-11}]
                r = [{'_type': r[0]['_type'],
                      '_index': r[0]['_index'],
                      'query': q,
                      'topic': ann['topic_id'].lower(),
                      'citance_number': ann['citance_number'],
                      'citation_text': ann['citation_text'],
                      'citing_article': ann['citing_article'],
                      '_score': sum([e['_score'] for e in r]),
                      'offset': [e['offset'][0] for e in r],
                      'sentence': [e['sentence'] for e in r],
                      '_id': '-000001'}]
            out_results.append(r)
#         with codecs.open('tmp/out_results.json', 'wb', 'utf-8') as mf:
#             json.dump(out_results, mf, indent=2)
#         sys.exit()
        return out_results
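
When the `ngram` option is set, the query above is rewritten as a bag of quoted n-gram phrases, which the Elasticsearch query-string syntax treats as phrase matches. Below is a small, self-contained sketch of that rewriting, assuming a plain whitespace tokenizer in place of the Elasticsearch analyzer used by the method; the function name is illustrative only.

def ngram_phrase_query(text, n=2):
    """Rewrite `text` as a query of quoted n-gram phrases."""
    tokens = text.split()
    phrases = ['"%s"' % ' '.join(tokens[i:i + n])
               for i in range(max(len(tokens) - n + 1, 0))]
    return ' '.join(phrases)

# ngram_phrase_query("statistical machine translation models", 2)
# -> '"statistical machine" "machine translation" "translation models"'
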
Exemple #32
0
    def __init__(self, index='biosum'):
        self.es_int = ESInterface(index_name=index)