Ejemplo n.º 1
0
def main():
    """Run retrieval for the round's topics against a local Elasticsearch index.

    Loads the CORD-19 metadata and the list of valid document ids, builds the
    topic-number -> query mapping, and delegates retrieval to ``query`` on the
    single combined index.  For round 2, round-1 qrels are loaded so previously
    judged documents can be taken into account.
    """
    meta = pd.read_csv(META)
    # VALID_ID is a headerless file with one cord_uid per line.
    valid = pd.read_csv(VALID_ID, names=['cord_uid'])
    queries = query_dict(TOPIC)

    # Long timeout: running a full round of queries against a local index
    # can take a while.
    es = Elasticsearch([{'host': 'localhost',
                         'port': 9200,
                         'timeout': 3600}])

    mkdir(RUN_DIR)

    qrels = None
    if ROUND == 2:
        # Qrels are whitespace-separated.  Use a raw string for the regex
        # separator: the original '\s{1,}' relied on a (deprecated) invalid
        # string escape; r'\s+' is the equivalent, well-formed pattern.
        qrels = pd.read_csv(QRELS_RND1,
                            sep=r'\s+',
                            names=['topic', 'Q0', 'docid', 'rel'],
                            index_col=False)

    query(es, meta, valid, queries, qrels, IDX_NAME=SINGLE_IDX)
Ejemplo n.º 2
0
import os
import matchzoo as mz
from core.util import query_dict
from core.clf_mz import train
from config.config import TOPIC, EMBEDDING, EMBED_DIR, BIOWORDVEC

if __name__ == '__main__':

    # Load the configured pre-trained word embedding.  Fail fast with a
    # clear error on an unknown setting instead of hitting a confusing
    # NameError on `embedding` further down.
    if EMBEDDING == 'glove':
        embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)
    elif EMBEDDING == 'biowordvec':
        embedding = mz.embedding.embedding.load_from_file(os.path.join(
            EMBED_DIR, BIOWORDVEC),
                                                          mode='word2vec')
    else:
        raise ValueError(
            "Unsupported EMBEDDING setting: {!r} "
            "(expected 'glove' or 'biowordvec')".format(EMBEDDING))

    # Train one DRMM model per topic; only the topic number is needed here,
    # the query text itself is unused.
    for topic_number in query_dict(TOPIC):

        train(topic_number, embedding, model_type='drmm')
Ejemplo n.º 3
0
from core.util import query_dict, map_sha_path, text

if __name__ == '__main__':

    # Map each document sha to its JSON file path on disk.
    msp = map_sha_path(DATA)
    meta = pd.read_csv(META)
    meta = meta[meta['sha'].notna(
    )]  # there are duplicates in metadata.csv, some without sha

    # get baseline ranking from run
    # Baseline run file is in standard TREC run format (space-separated).
    df_baseline = pd.read_csv(
        os.path.join(RUN_DIR, BASELINE),
        sep=' ',
        names=['topic', 'Q0', 'cord_uid', 'rank', 'score', 'team'],
        index_col=False)
    queries = query_dict(TOPIC)

    if ROUND == 1:
        # Load the configured word embedding.
        # NOTE(review): if EMBEDDING is neither 'glove' nor 'biowordvec',
        # `embedding` is never assigned and later use would raise
        # NameError — confirm whether that is intended.
        if EMBEDDING == 'glove':
            embedding = mz.datasets.embeddings.load_glove_embedding(
                dimension=300)
        if EMBEDDING == 'biowordvec':
            embedding = mz.embedding.embedding.load_from_file(os.path.join(
                EMBED_DIR, BIOWORDVEC),
                                                              mode='word2vec')

        # Restrict the baseline run to the current topic's documents.
        for topic_number, query in queries.items():
            topic_df = df_baseline[df_baseline['topic'] == int(topic_number)]
            cord_uids = topic_df['cord_uid']

            # make datapack
            # NOTE(review): this excerpt appears truncated here — the loop
            # body continues beyond this point in the original source.
Ejemplo n.º 4
0
def train_data(topic_train):
    """Build a MatchZoo DataPack of (topic, article) training pairs.

    For every topic in ``query_dict(TOPIC)``, parses the fetched PubMed XML
    dump and emits one row per article: the topic's query text on the left,
    the article text on the right, and label 1 only when the topic is
    ``topic_train`` (0 for every other topic).

    Args:
        topic_train: topic number whose articles are labelled relevant.

    Returns:
        A MatchZoo DataPack (``mz.pack``) wrapping the assembled DataFrame
        with columns text_left/id_left/text_right/id_right/label.
    """
    queries = query_dict(TOPIC)

    text_left = []
    id_left = []
    text_right = []
    id_right = []
    label = []

    for k, v in queries.items():
        file_path = os.path.join(PUBMED_FETCH, PUBMED_DUMP_DATE, str(k)+'.xml')
        # Renamed from `input`, which shadowed the builtin.
        with open(file_path, 'r') as xml_file:
            soup = bs(xml_file.read(), 'lxml')

            if FULLTEXT_PMC:
                # Full-text PMC dump: build the article text from abstract,
                # titles and section bodies.
                articles = soup.find('pmc-articleset').find_all('article')
                for article in articles:
                    pbmid_str = article.find("article-id", {"pub-id-type": "pmc"}).text.replace('\n', ' ').strip()
                    txt = ''
                    abstract = article.abstract
                    if abstract:
                        txt = abstract.text.replace('\n', ' ').strip(' ')
                    sections = article.find_all('sec')
                    titles = article.find_all('article-title')

                    for title in titles:
                        title_text = title.text.replace('\n', ' ').strip(' ')
                        # BUG FIX: str.join returns a new string; the original
                        # discarded the result, so titles were never appended.
                        txt = ''.join([txt, ' ', title_text])
                    for section in sections:
                        section_text = section.text.replace('\n', '').strip(' ')
                        # BUG FIX: assign the joined result (was discarded).
                        txt = ''.join([txt, ' ', section_text])

                    rel = (1 if k == str(topic_train) else 0)
                    id_left.append(str(k))
                    text_left.append(v)
                    id_right.append(pbmid_str)
                    text_right.append(txt)
                    label.append(rel)

            else:
                # Abstract-only PubMed dump: title + abstract; articles
                # without an abstract are skipped entirely.
                articles = soup.find_all('pubmedarticle')
                for article in articles:
                    pbmid = article.find('articleid', {"idtype": "pubmed"})
                    pbmid_str = pbmid.text.replace('\n', '').strip()
                    abstract = article.find('abstract')
                    if abstract is None:
                        continue
                    else:
                        abstract_text = abstract.text.replace('\n', '')

                    title = article.articletitle.text.replace('\n', '').strip()
                    txt = title + abstract_text

                    rel = (1 if k == str(topic_train) else 0)
                    id_left.append(str(k))
                    text_left.append(v)
                    id_right.append(pbmid_str)
                    text_right.append(txt)
                    label.append(rel)

    df = pd.DataFrame(data={'text_left': text_left,
                            'id_left': id_left,
                            'text_right': text_right,
                            'id_right': id_right,
                            'label': label})

    return mz.pack(df)