def main():
    """Run baseline retrieval for every topic against a local Elasticsearch.

    Loads the metadata and valid-id CSVs, builds the topic->query map,
    creates RUN_DIR, and delegates per-topic querying to query(). For
    round 2, round-1 qrels are loaded and passed along (presumably so
    already-judged documents can be treated specially — verify in query()).
    """
    meta = pd.read_csv(META)
    valid = pd.read_csv(VALID_ID, names=['cord_uid'])
    queries = query_dict(TOPIC)
    # generous timeout: scoring a full collection on one node can be slow
    es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 3600}])
    mkdir(RUN_DIR)
    qrels = None
    if ROUND == 2:
        # FIX: the original used sep='\s{1,}' in a plain string literal,
        # an invalid escape sequence; r'\s+' is the equivalent,
        # warning-free regex for whitespace-separated qrels files.
        qrels = pd.read_csv(QRELS_RND1, sep=r'\s+',
                            names=['topic', 'Q0', 'docid', 'rel'],
                            index_col=False)
    query(es, meta, valid, queries, qrels, IDX_NAME=SINGLE_IDX)
import os

import matchzoo as mz

from core.util import query_dict
from core.clf_mz import train
from config.config import TOPIC, EMBEDDING, EMBED_DIR, BIOWORDVEC

if __name__ == '__main__':
    # Load the embedding selected in the config. FIX: the original used
    # two independent `if`s with no else, so an unrecognized EMBEDDING
    # left `embedding` unbound and train() died with a NameError; fail
    # fast with a clear message instead.
    if EMBEDDING == 'glove':
        embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)
    elif EMBEDDING == 'biowordvec':
        embedding = mz.embedding.embedding.load_from_file(
            os.path.join(EMBED_DIR, BIOWORDVEC), mode='word2vec')
    else:
        raise ValueError(f"unsupported EMBEDDING: {EMBEDDING!r}")
    # Train one DRMM model per topic.
    for topic_number, query in query_dict(TOPIC).items():
        train(topic_number, embedding, model_type='drmm')
# NOTE(review): this chunk appears truncated — the per-topic loop ends at
# the "make datapack" comment with `cord_uids` unused, so the rest of the
# loop body is presumably outside this view. The nesting below (loop at
# the __main__ level, not inside `if ROUND == 1`) is a best guess from
# the collapsed source — confirm against the original file.
from core.util import query_dict, map_sha_path, text

if __name__ == '__main__':
    # presumably maps sha -> file path for the collection; verify in core.util
    msp = map_sha_path(DATA)
    meta = pd.read_csv(META)
    # there are duplicates in metadata.csv, some without sha
    meta = meta[meta['sha'].notna()]
    # get baseline ranking from run
    df_baseline = pd.read_csv(
        os.path.join(RUN_DIR, BASELINE), sep=' ',
        names=['topic', 'Q0', 'cord_uid', 'rank', 'score', 'team'],
        index_col=False)
    queries = query_dict(TOPIC)
    if ROUND == 1:
        # load the embedding configured for this run; NOTE(review): no
        # else-branch, so `embedding` stays unbound for other EMBEDDING
        # values (and for ROUND != 1)
        if EMBEDDING == 'glove':
            embedding = mz.datasets.embeddings.load_glove_embedding(
                dimension=300)
        if EMBEDDING == 'biowordvec':
            embedding = mz.embedding.embedding.load_from_file(os.path.join(
                EMBED_DIR, BIOWORDVEC), mode='word2vec')
    for topic_number, query in queries.items():
        # restrict the baseline run to this topic's rows
        topic_df = df_baseline[df_baseline['topic'] == int(topic_number)]
        cord_uids = topic_df['cord_uid']
        # make datapack
def train_data(topic_train):
    """Build a matchzoo DataPack of (topic query, PubMed article) pairs.

    For every topic in the TOPIC query dict, the fetched PubMed/PMC XML
    dump for that topic is parsed and each article becomes one pair.
    A pair is labelled 1 when its source topic equals `topic_train` and
    0 otherwise (one-vs-rest training data).

    Args:
        topic_train: topic number whose articles form the positive class.

    Returns:
        matchzoo DataPack built from a DataFrame with columns
        text_left/id_left (query side) and text_right/id_right/label
        (document side).
    """
    queries = query_dict(TOPIC)
    text_left, id_left = [], []
    text_right, id_right = [], []
    label = []
    for k, v in queries.items():
        file_path = os.path.join(PUBMED_FETCH, PUBMED_DUMP_DATE,
                                 str(k) + '.xml')
        # `infile` rather than `input`, so the builtin is not shadowed
        with open(file_path, 'r') as infile:
            soup = bs(infile.read(), 'lxml')
        # relevance is per source topic, identical for every article in
        # this file — hoisted out of the article loops
        rel = 1 if k == str(topic_train) else 0
        if FULLTEXT_PMC:
            articles = soup.find('pmc-articleset').find_all('article')
            for article in articles:
                pbmid_str = article.find(
                    "article-id", {"pub-id-type": "pmc"}
                ).text.replace('\n', ' ').strip()
                txt = ''
                abstract = article.abstract
                if abstract:
                    txt = abstract.text.replace('\n', ' ').strip(' ')
                # BUG FIX: the original called ''.join([...]) and threw
                # the result away, so titles and section bodies never
                # reached txt; the concatenation is now assigned back.
                for title in article.find_all('article-title'):
                    title_text = title.text.replace('\n', ' ').strip(' ')
                    txt = ''.join([txt, ' ', title_text])
                for section in article.find_all('sec'):
                    section_text = section.text.replace('\n', '').strip(' ')
                    txt = ''.join([txt, ' ', section_text])
                id_left.append(str(k))
                text_left.append(v)
                id_right.append(pbmid_str)
                text_right.append(txt)
                label.append(rel)
        else:
            for article in soup.find_all('pubmedarticle'):
                pbmid = article.find('articleid', {"idtype": "pubmed"})
                pbmid_str = pbmid.text.replace('\n', '').strip()
                abstract = article.find('abstract')
                if abstract is None:
                    # articles without an abstract are skipped entirely
                    continue
                abstract_text = abstract.text.replace('\n', '')
                title = article.articletitle.text.replace('\n', '').strip()
                txt = title + abstract_text
                id_left.append(str(k))
                text_left.append(v)
                id_right.append(pbmid_str)
                text_right.append(txt)
                label.append(rel)
    df = pd.DataFrame(data={'text_left': text_left, 'id_left': id_left,
                            'text_right': text_right, 'id_right': id_right,
                            'label': label})
    return mz.pack(df)