# NOTE(review): `f` and `target_ids` are not defined in this excerpt; this loop
# presumably belongs to an earlier `with open(...)` cell - confirm.
for c in tqdm.tqdm(
        session.query(Candidate).filter(
            Candidate.id.in_(target_ids)).all()):
    f.write(c.get_parent().text + "\n")


# ### Generate Data to Train On

# In[ ]:


# Randomly sample up to 500000 candidate ids of type 'disease_gene' from split 0.
# Only the id is selected because the values are used below as `Candidate.id`.
sql = ''' SELECT id from candidate WHERE split = 0 and type='disease_gene' ORDER BY RANDOM() LIMIT 500000; '''
target_cids = [x[0] for x in session.execute(sql)]


# In[ ]:


# Write the parent text of every sampled candidate, one per line, fetching the
# candidates page by page instead of loading them all at once.
batch_size = 50000
offset = 0
with open("data/doc2vec/train_data_500k.txt", "w") as f:
    while True:
        cands = (
            session.query(Candidate)
            .filter(Candidate.id.in_(target_cids))
            # A stable ordering is required for OFFSET/LIMIT pages to be
            # disjoint and to cover every row exactly once.
            .order_by(Candidate.id)
            .offset(offset)
            .limit(batch_size)
            .all()
        )

        # An empty page means every candidate has been written.
        if not cands:
            break

        for c in tqdm.tqdm(cands):
            f.write(c.get_parent().text + "\n")

        # Advance to the next page; without this the loop would keep
        # re-reading (and re-writing) the first page forever.
        offset += batch_size
# One candidate class per relation type, each paired with a pretagged
# extractor built from that class and its two entity type names.
GeneGene = candidate_subclass('GeneGene', ['Gene1', 'Gene2'])
gge = PretaggedCandidateExtractor(GeneGene, ['Gene', 'Gene'])

CompoundGene = candidate_subclass('CompoundGene', ['Compound', 'Gene'])
cge = PretaggedCandidateExtractor(CompoundGene, ['Compound', 'Gene'])

CompoundDisease = candidate_subclass('CompoundDisease', ['Compound', 'Disease'])
cde = PretaggedCandidateExtractor(CompoundDisease, ['Compound', 'Disease'])


# In[ ]:


# Fix the NumPy seed so the random draw below is reproducible.
np.random.seed(100)

# Number of rows in the sentence table.
sentence_count_row = session.execute(
    "select count(*) from sentence").fetchone()
total_sentences = sentence_count_row[0]


# In[ ]:


# Draw one category per sentence: 0 with p=0.7, 1 with p=0.2, 2 with p=0.1.
# NOTE(review): presumably 0/1/2 map to train/dev/test - confirm against the
# code that consumes `category_list`.
category_list = np.random.choice(
    [0, 1, 2],
    size=total_sentences,
    p=[0.7, 0.2, 0.1],
)


# In[ ]:


# Divide the sentences into train, dev and test sets.
# Start with empty containers for each split and a zero offset.
train_sens, dev_sens, test_sens = set(), set(), set()
offset = 0
LIMIT 10; ''' ] spreadsheet_names = { 'train': 'data/compound_disease/sentence_labels_train.xlsx', 'train_hand_label': 'data/compound_disease/sentence_labels_train_dev.xlsx', 'dev': 'data/compound_disease/sentence_labels_dev.xlsx' } # In[ ]: for sql, spreadsheet_name in zip(sql_statements, spreadsheet_names.values()): target_cids = [x[0] for x in session.execute(sql)] candidates = ( session .query(CandidateClass) .filter(CandidateClass.id.in_(target_cids)) .all() ) candidate_df = make_sentence_df(candidates) write_candidates_to_excel(candidate_df, spreadsheet_name) # # Develop Label Functions # ## Look at potential Candidates # Use this to look at loaded candidates from a given set. The constants represent the index to retrieve the appropiate set. Ideally, here is where one can look at a subset of the candidate and develop label functions for candidate labeling.
ORDER BY RANDOM() LIMIT 50000; ''', ''' SELECT id from candidate WHERE split = 1 and type='disease_gene' ORDER BY RANDOM() LIMIT 10000; ''', ''' SELECT id from candidate WHERE split = 2 and type='disease_gene' ORDER BY RANDOM() LIMIT 10000; ''' ] #Exectue the queries and output them to excel files. session.execute("SELECT setseed(0.5);") for sql, spreadsheet_name in zip(sql_statements, spreadsheet_names.values()): target_cids = [x[0] for x in session.execute(sql)] candidates = ( session .query(Candidate) .filter(Candidate.id.in_(target_cids)) .all() ) candidate_df = make_sentence_df(candidates) write_candidates_to_excel(candidate_df, spreadsheet_name)