Example 1
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.regex_citation = re.compile(r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
                                         r"(\[(\d+([,–]\s?)?)+\])|"
                                         r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with open(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
        self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # e.g. (Chen et al. 2000)
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer(r"[^\w\-']+", gaps=True)
        self.lmtzr = WordNetLemmatizer()
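Below is a stand-alone sketch (sample sentence invented) of what the citation patterns compiled above strip out; since the compiled pattern is bound to .sub, calling it with an empty replacement deletes citation markers:

    import re

    # Same APA/IEEE alternations as regex_citation above.
    strip_citations = re.compile(
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"(\[(\d+([,–]\s?)?)+\])|"
        r"\[[\d,-]+\]").sub

    sample = "Prior work (Chen et al. 2000) and [3, 4] reported similar results."
    print(strip_citations("", sample))  # "Prior work  and  reported similar results."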
Example 2
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)

        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)
        self.analyzer = self.es_int.get_index_analyzer()
        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.remove_stopwords:
            with open(self.opts.stopwords_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.db = MySQLdb.connect(host=constants.mysql_server,
                                  port=constants.mysql_port,
                                  user=constants.mysql_user,
                                  passwd=constants.mysql_pass,
                                  db=constants.mysql_db)
        self.cur = self.db.cursor()
        self.ttys = ['SY']

        ttygroups = {
            "syns": ('AUN', 'EQ', 'SYN', 'MTH'),
            "chemicals": ('CCN', 'CSN'),
            "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
            "diseases": ('DI', ),
            "findings": ('FI', ),
            "hierarchy": ('HS', 'HT', 'HX'),
            "related": ('RT', ),
            "preferred": ('PTN', 'PT')
        }
        self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
        #         self.ann_client = AnnotationsClient()

        self.reg_apa = re.compile(
            # e.g. (Chen et al. 2000)
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
            r"\w+\set al\. \(\d{2,4}\)")  # [Chen et al. 200]
        self.reg_apa_rare = re.compile(
            r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+"
        )
        self.reg_apa2 = re.compile(
            r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)"
        )
        self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
        self.reg_paranthesis = re.compile(
            r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
        self.nlp_extractor = Extract_NLP_Tags()
        self.tokenizer = RegexpTokenizer(r"[^\w\-']+", gaps=True)
        self.lmtzr = WordNetLemmatizer()
        self.stemmer = stem.porter.PorterStemmer()
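The tokenizer, lemmatizer, and stemmer initialized above form a standard NLTK preprocessing chain; a minimal sketch of how they combine, assuming the WordNet data is installed (the stopword set here is a stand-in for the file loaded from self.opts.stopwords_path):

    from nltk import stem
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.tokenize import RegexpTokenizer

    tokenizer = RegexpTokenizer(r"[^\w\-']+", gaps=True)  # split on non-word runs
    lmtzr = WordNetLemmatizer()
    stemmer = stem.porter.PorterStemmer()
    stopwords = frozenset(['the', 'of', 'in'])  # placeholder stopword list

    sentence = "The effects of gene-expression changes in tumors"
    tokens = [t.lower() for t in tokenizer.tokenize(sentence)
              if t.lower() not in stopwords]
    print([stemmer.stem(lmtzr.lemmatize(t)) for t in tokens])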
Example 3
    def __init__(self, index_name=constants.bm25_index):
        '''
        Default constructor.

        Args:
            index_name(str): The elasticsearch index name that will
                be used to retrieve documents and idfs
        '''
        self.es_int = ESInterface(index_name=index_name)
        print(self.es_int.get_avg_size('sentence'))
        self.avg_doc_length = -1
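The index name (constants.bm25_index) together with the average-size lookup suggests BM25-style length normalization; for reference, a sketch of the standard Okapi BM25 per-term score that an average document length plugs into (not this codebase's actual scorer):

    # k1 and b are the usual BM25 free parameters; tf and idf would come
    # from the Elasticsearch index in the real class.
    def bm25_term(tf, idf, doc_len, avg_doc_len, k1=1.2, b=0.75):
        return idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))

    print(bm25_term(tf=3, idf=2.1, doc_len=24, avg_doc_len=18.5))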
Example 4
def dump_stats(dump_stats_data, dump_path, index_name):

    es_int = ESInterface(index_name=index_name)
    csv_line = []

    for ann, res in dump_stats_data:
        csv_line.extend([[
            ann['topic_id'], ann['citing_article'][:-4].lower(),
            ann['reference_article'][:-4].lower(), ann['discourse_facet']
        ], [''], [ann['citation_text'].encode('ascii', 'ignore')], ['']])
        offsets = chain(*[[s[0], s[1], '']
                          for s in sorted(ann['reference_offset'].keys(),
                                          key=lambda t: t[0])])
        csv_line.extend([list(offsets), ['']])
        csv_line.append(['prec:'])
        csv_line.extend([list(t) for t in calculate_ap([res], [ann]).items()])
        csv_line.append(['ndcg:'])
        csv_line.extend(
            [list(t) for t in calculate_ndcg([res], [ann]).items()])
        csv_line.append([''])
        for i, r in enumerate(res, start=1):
            rel = str(calculate_ndcg([[r]], [ann])['all'] > 0).upper()

            # temp until Arman fixes bug
            txt = es_int.get_page_by_res(r)['sentence'].encode(
                'ascii', 'ignore')
            offset = str(
                es_int.get_page_by_res(r)['offset']).strip('()').split(', ')
            csv_line.extend([[txt],
                             [
                                 'rank', i, '', 'offset', offset[0], offset[1],
                                 '', 'rel?', rel
                             ]])

            # commented until bugs fixed
#             txt = []
#             for offset in r['offset']:
#                 txt.append(ann_cl.get_doc('_'.join(r['_type'].split('_')[:2]),
#                                           r['_type'].split('_')[2], offset))
#             txt = ' ... '.join(txt)
#             csv_line.extend([[txt], ['rank', i, '', 'offset',
#                                      r['offset'][0][0], r['offset'][0][1],
#                                      '', 'rel?', rel]])
#             csv_line.append([''])
        csv_line.extend([[''], ['']])

    with open(dump_path, 'wb') as csv_file:
        wr = csv.writer(csv_file)
        wr.writerows(csv_line)
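dump_stats accumulates csv_line as a list of rows, each row itself a list of cell values, then writes everything with a single writerows call. A stand-alone illustration of that layout (file name and values invented; opened in Python 3 text mode, whereas the original uses Python 2's 'wb'):

    import csv

    rows = [
        ['topic_a', 'citing_01', 'reference_01', 'method'],  # metadata row
        [''],                                                # blank spacer row
        ['prec:'],
        ['all', 0.42],
    ]
    with open('example_stats.csv', 'w') as csv_file:
        csv.writer(csv_file).writerows(rows)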
Example 5
 def __init__(self, args, opts):
     super(Method, self).__init__(args, opts)
     self.es_int = ESInterface(host=self.opts.server,
                               port=self.opts.port,
                               index_name=self.opts.index_name)
     self.regex_citation = re.compile(r"(\(\s?(([A-Za-z]+\.?\s?)+,? \d+"
                                      r"(\s?([;,]|and)\s)?)+\))|"
                                      r"(\[(\d+([,–]\s?)?)+\])|"
                                      r"\[[\d,-]+\]").sub
     self.all_digits = re.compile(r"^\d+$").search
     if self.opts.remove_stopwords:
         with open(self.opts.stopwords_path) as f:
             self.stopwords = frozenset([l.strip().lower() for l in f])
     else:
         self.stopwords = frozenset([])
Example 6
 def __init__(self, args, opts):
     super(Method, self).__init__(args, opts)
     self.es_int = ESInterface(host=self.opts.server,
                               port=self.opts.port,
                               index_name=self.opts.index_name)
     self.regex_citation = re.compile(
         r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
         r"(\[(\d+([,–]\s?)?)+\])|"
         r"\[[\d,-]+\]").sub
     self.all_digits = re.compile(r"^\d+$").search
     if self.opts.remove_stopwords:
         with open(self.opts.stopwords_path) as f:
             self.stopwords = frozenset([l.strip().lower() for l in f])
     else:
         self.stopwords = frozenset([])
     self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
     self.ann_client = AnnotationsClient()
Example 7
    def __init__(self,
                 filter_list=None,
                 base_cache=None,
                 cachedir='cache',
                 eshost=None,
                 esport=None,
                 esindex=None,
                 sim_func=CosineSimilarity(),
                 stopwords=None,
                 weighted=False,
                 query_terms_only=False):

        if not eshost:
            eshost = 'localhost'
        if not esport:
            esport = 9200
        if not esindex:
            esindex = 'pubmed'

        self.es = ESInterface(host=eshost, port=esport, index_name=esindex)
        self.cachedir = cachedir
        self.sim_func = sim_func
        self.timer = Timer(prefix='[timer]')
        self.weighted = weighted
        self.query_terms_only = query_terms_only
        self.base_cache = base_cache

        if not stopwords:
            stopwords = set()
        self._stopwords = stopwords

        if filter_list:
            filter_list = set([e for e in filter_list if e not in stopwords])
        self._filter_list = filter_list

        # calculate fingerprint to use as cache comment!
        finger_text = ' '.join([
            w for w in set.union((self._filter_list or set()), self._stopwords)
        ])
        finger_md5 = md5()
        finger_md5.update(finger_text.encode('utf-8'))
        self.finger_filter = finger_md5.hexdigest()
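A minimal sketch of the fingerprinting idea above: hash the union of the filter list and the stopword set so caches built under different term filters never collide. Sorting before joining (which the original does not do) keeps the digest stable under Python 3's string-hash randomization:

    from hashlib import md5

    filter_list = {'cancer', 'therapy'}  # placeholder terms
    stopwords = {'the', 'of'}
    finger_text = ' '.join(sorted(filter_list | stopwords))
    print(md5(finger_text.encode('utf-8')).hexdigest())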
Example 8
    def __init__(self, args, opts):
        super(Method, self).__init__(args, opts)
        self.es_int = ESInterface(host=self.opts.server,
                                  port=self.opts.port,
                                  index_name=self.opts.index_name)

        self.regex_citation = re.compile(
            r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
            r"(\[(\d+([,–]\s?)?)+\])|"
            r"\[[\d,-]+\]").sub
        self.all_digits = re.compile(r"^\d+$").search
        if self.opts.stopwords_path:
            stop_path = self.opts.stopwords_path
        else:
            stop_path = STOPWORDS_PATH
        if self.opts.remove_stopwords:
            with open(stop_path) as f:
                self.stopwords = frozenset([l.strip().lower() for l in f])
        else:
            self.stopwords = frozenset([])
        self.tokenizer = RegexpTokenizer(r"[^\w\-']+", gaps=True)
Example 9
    def __init__(self,
                 documents,
                 eshost='localhost',
                 esport=9200,
                 esindex='pubmed21',
                 cachedir='cache'):

        self.cachedir = cachedir
        self.questions = documents
        self.categories = None

        self.added = dict([(qid, []) for qid in self.questions.keys()])
        self.removed = dict([(qid, []) for qid in self.questions.keys()])

        self.es = ESInterface(host=eshost, port=esport, index_name=esindex)

        self.tokenquestions = self.tokenize_questions(self.questions.items())
        self.tokquestions = dict([(k, " ".join(v))
                                  for k, v in self.tokenquestions.items()])

        self.run()
Example 10
 def __init__(self, index='biosum'):
     self.es_int = ESInterface(index_name=index)
Example 11
 def __init__(self, cache_index='cache'):
     self.es_int = ESInterface(index_name=cache_index)