skiplines=0) fda_file = FR("../FDA/FDA_DRUG_DATABASE_cured_cleaned.latest.txt", sep="\t", suppress_newlines=True, encoding="utf-8", skiplines=0) drugbank_alias_file = FR( "../DRUGBANK/drugbank_extracted_identifiers.latest.txt", sep="\t", suppress_newlines=True, encoding="utf-8", skiplines=0) lines = annotation_file.readlines() annotation_header = lines.pop(0) # print(lines[0]) drug_dict = {} for line in lines: drug_dict[line[0]] = {} for i in range(len(annotation_header)): drug_dict[line[0]][annotation_header[i]] = line[i] drug_dict[line[0]]["FDA_APPROUVED"] = 0 problem_list = [] change_list = []
if __name__ == '__main__': # Load files and data print("reading files") pmid = [] drugs_data = [] drug_data_file = FR("../DRUG_LISTS/full_drug_list.latest.txt", sep=';', suppress_newlines=True, skiplines=0, encoding="utf-8") drugs_data = drug_data_file.readlines() # for line in drug_data_file.iter(): # pmid.append(line[0]) # a = line[1] # if a!="NA" or a!="": # drugs_data.append(a) # Parse xml file. print("parsing xml") tree = ET.parse('../DRUGBANK/drugbank_db.xml') root = tree.getroot() sample = [ "Erlotinib", "Irinotecan", "Cisplatin", "Pembrolizumab", "Bevacizumab", "(4R)-limonene", "Obinutuzumab", "Rituximab"
dlist_file = FR( "../DRUG_LISTS/tbl_Abst_drug_vs_symbols_match - DRUG_LIST 2018 05 17.txt", sep="\t", suppress_newlines=True, skiplines=1, strip_chars_pattern=charstrip, encoding="utf-16") dmatch_file = FR( "../DRUG_LISTS/tbl_Abst_drug_vs_symbols_match - DRUG_MATCH 2018 05 27.txt", sep="\t", suppress_newlines=True, skiplines=1, strip_chars_pattern=charstrip, encoding="utf-16") nosym = nosym_file.readlines() supp = supp_file.readlines() dlist = dlist_file.readlines() dmatch = dmatch_file.readlines() # Remove Blank lines for f in [supp, nosym, dlist, dmatch]: for s in f: while "" in s: s.remove("") final_ref = [] pool = [] # Extract lines for f in [supp, nosym, dlist, dmatch]:
from utils import File_Reader as FR from utils import File_Maker as FM from utils import Task_Follower as TF import random import re from string import punctuation strippattern = "^\"|\"$" pubmed_file = FR("../PUBMED_DATA/pubmed_data_2606.txt", sep = "\t", suppress_newlines = True, skiplines = 1, encoding = "CP1252", strip_chars_pattern = strippattern) drugs_file = FR("../DRUG_LISTS/drug_list_2606_curated_cleaned.latest.txt", sep = ";", suppress_newlines = True, encoding = "utf-8") pubmed = pubmed_file.readlines() drugs = drugs_file.readlines() match = {} for article in pubmed: match[int(article[1])] = ("","",[],"") # tf = TF(len(drugs)) for names in drugs: # tf.step() count=+1 for article in pubmed:
from utils import File_Reader as FR
from utils import File_Maker as FM

# Reader for the annotation pivot table (tab-separated).
annotation_file = FR(
    "../DRUG_LISTS/drug_pivot_0_1.clean.latest.txt",
    sep="\t",
    suppress_newlines=True,
    encoding="utf-8",
)
# NOTE(review): sep is an empty string here - confirm this is the intended
# separator for this file.
drugs_file = FR(
    "../DRUG_LISTS/drug_list_2606_curated_cleaned.latest.txt",
    sep="",
    suppress_newlines=True,
    encoding="utf-8",
)

# The first row read is removed and used as the list of column names.
annotations = annotation_file.readlines()
header = annotations.pop(0)

# Index every remaining row by its first field. Each entry maps a column name
# to the field at the same position; rows shorter than the header get ""
# for the positions they do not have.
old_drug_dict = {}
for line in annotations:
    old_drug_dict[line[0]] = {
        column: (line[position] if position < len(line) else "")
        for position, column in enumerate(header)
    }
print(header)

# First field of every row, in file order.
old_drugs = [line[0] for line in annotations]
sep = "\t", suppress_newlines = True, encoding = "utf-8", skiplines = 0) # drugs_file = FR("../PUBMED_DATA/drugbank2606.latest.txt", # sep = "\t", suppress_newlines = True, encoding = "utf-8", skiplines = 0) fda_file = FR("../FDA/FDA_DATABASE_2018_07.txt", sep = "", suppress_newlines = True, encoding = "CP1252", skiplines = 1) strippattern = "^\"|\"$|^ +| +$" fda_file2 = FR("../FDA/FDA_DATABASE_2018_07.txt", sep = "\t", suppress_newlines = True, encoding = "CP1252", skiplines = 0, strip_chars_pattern = strippattern) # fda_dict = fda_file.as_dict(lines_askeys = True) fda_lines = fda_file.readlines() fda_dict = fda_file2.as_dict(lines_askeys = True) header,drugs_dict = drugs_file.as_dict(ret_header = True) fda_cols_retained = ["SubmissionStatusDate", "SubmissionStatus", "SponsorName", "ActiveIngredient"] app = "FDA_" header.append("HAS_FDA_ENTRY") for col in fda_cols_retained: header.append(app+col) for key in drugs_dict.keys(): # alias = ";".join([drugs_dict[key]["COMMON_DRUGBANK_ALIAS"], # drugs_dict[key]["MINED_ALIAS"]]) if drugs_dict[key]["MINED_ALIAS"] else drugs_dict[key]["COMMON_DRUGBANK_ALIAS"] alias = drugs_dict[key]["COMMON_DRUGBANK_ALIAS"] found_indexes = [] for line in fda_lines: # if alias.lower() in line.lower(): if any([a.lower() in line.lower() for a in alias.split(";")]):
from utils import File_Reader as FR
from utils import File_Maker as FM
from utils import Task_Follower as TF
import cirpy
import pubchempy as pcp
from datetime import datetime

# Reader for the tab-separated pivot file.
pivot_file = FR(
    "../PUBMED_DATA/drugs2606minedalias_with_found_identifiers.latest.txt",
    sep="\t",
    suppress_newlines=True,
    encoding="utf-8",
)
lines = pivot_file.readlines()
print(len(lines))
# The first row is removed from `lines` and kept as the header.
header = lines.pop(0)


def cirpy_getter(drug):
    """Return the names that cirpy resolves for *drug*, joined with ';'.

    Returns an empty string when the lookup result is falsy.
    """
    resolved_names = cirpy.resolve(drug, 'names')
    return ";".join(resolved_names) if resolved_names else ""


def pcp_getter(drug):
    """Return the "Synonym" values of the first pubchempy hit for *drug*.

    The values are joined with ';'. When the lookup result is falsy there is
    no explicit return, so the function yields None.
    """
    # NOTE(review): cirpy_getter returns "" on a miss while this function
    # falls through to None - confirm callers expect that difference.
    hits = pcp.get_synonyms(drug, 'name')
    if hits:
        first_hit = hits[0]
        return ";".join(first_hit["Synonym"])
from utils import File_Reader as FR from utils import File_Maker as FM from utils import head import re from datetime import datetime annotation_file = FR("../DRUG_LISTS/drug_pivot_2606.latest.txt", sep="\t", suppress_newlines=True, encoding="utf-8") annotations = annotation_file.readlines() header = annotations.pop(0) header.append("DRUGBANK_SYNONYS_AND_PRODUCTS") header.append("DRUGBANK_ID") header.append("CAS_NUMBER") header.append("UNII") header.append("ASSOCIATED_PMID") header.append("OLDEST_PMID") header.append("OLDEST_DATE_OF_PUBLICATION") print(header) pool_data = [] pool_data.append(header) drug_dict = {} for line in annotations: drug_dict[line[0]] = {} for i in range(len(header)):