""" Count drug occurences from input file. Format of said file should be checked. """ from utils import File_Reader as FR from utils import File_Maker as FM file = FR("../PUBMED_DATA/pubmedNdrugs.latest.txt", sep='\t', suppress_newlines=True, skiplines=0, encoding="utf-8") counter = {} for line in file.iter(): drug = line[3] drug = drug.split(";")[0] if drug: if drug not in counter: counter[drug] = 1 else: counter[drug] += 1 print(counter) print(len(counter)) s = sorted(counter.items(), key=lambda x: x[1])[::-1] print(s)
"""Cleaning of the FDA database.

Reads the tab-separated FDA drug table, passing the reader a pattern that
matches runs of spaces at the start or end of a string, keeps the first
five fields of every row and saves the result through File_Maker.
"""
from utils import File_Reader as FR
from utils import File_Maker as FM

# Regex handed to the reader as its strip pattern: leading or trailing spaces.
charstrip = "^[ ]+|[ ]+$"

fda_file = FR(
    "../FDA/FDA_DRUG_DATABASE_cured.txt",
    encoding="utf-16",
    sep="\t",
    strip_chars_pattern=charstrip,
)

# Only the first five columns of each row are kept.
fda = [row[:5] for row in fda_file.iter()]
print(fda)

file = FM(
    "../FDA/FDA_DRUG_DATABASE_cured_cleaned",
    data_stream=fda,
    extension=".txt",
)
file.save()

# Manual alternative to File_Maker, kept for reference:
# with open("../FDA/FDA_DRUG_DATABASE_cured_cleaned.latest.txt", 'w', encoding = 'utf-8') as fp:
#     for i in fda:
#         fp.write("\t".join(i)+'\n')
# Fragment overview (the script is cut off after the final `if b:`):
#   * opens the tab-separated alias file and the DrugBank identifiers file
#     (the latter with skiplines=1) through FR, and loads the DrugBank file
#     with readlines() into `drugbank`;
#   * initialises `count` to 0;
#   * for every alias row, takes column 0 as `key`; when column 1 or column 2
#     is non-empty, each non-empty column is lower-cased and split on ";"
#     into `a` / `b`, and `a` is added to the `aliases` list.
# NOTE(review): columns 1 and 2 presumably hold the CIR and PubChem aliases
# named in the header comment - confirm against the input file.
# Search drugbank for alternative aliases from CIR and Pubchem alternative_alias_file = FR("../DRUG_LISTS/pcp_cir_newalias.latest.txt", sep="\t", suppress_newlines=True, encoding="utf-8") drugbank_db_file = FR("../DRUGBANK/drugbank_extracted_identifiers.latest.txt", skiplines=1, suppress_newlines=True, encoding="utf-8") drugbank = drugbank_db_file.readlines() count = 0 for new in alternative_alias_file.iter(): key = new[0] # print(key) if new[1] or new[2]: a = '' if new[1]: a = new[1].lower().split(";") b = '' if new[2]: b = new[2].lower().split(";") aliases = [] if a: aliases.extend(a) if b:
""" return re.sub(pattern, '', string) # Load files and data print("reading files") pmid = [] drugs_data = [] drug_data_file = FR("../PUBMED_DATA/pubmedNdrugs_2.txt", sep='\t', suppress_newlines=True, skiplines=0, encoding="utf-16") for line in drug_data_file.iter(): pmid.append(line[0]) a = line[1] if a != "NA" or a != "": drugs_data.append(a) # Parse xml file. print("parsing xml") tree = ET.parse('../DRUGBANK/drugbank_db.xml') root = tree.getroot() # a = [elem for elem in root.findall(get_xpath("by_PMID", keyword = "26242220"))] # a = [elem for elem in root.findall("./drug[name='Lepirudin']")] res = [] print("search")
# Fragment overview (the script is cut off at the final `else:`):
#   * opens the original drug list (sep="") and the manually annotated
#     modifications file (tab-separated, first line skipped, with a strip
#     pattern matching a leading or trailing double quote) through FR;
#   * copies every item yielded by the drug list reader into `drugs`;
#   * for every modifications row: splits column 3 on "|" into
#     `current_Drugs`, reads columns 5, 6 and 7, removes all empty strings
#     from `current_Drugs`, and, when column 7 (`Alias_to_delete`) is
#     non-empty and present in `drugs`, removes its first occurrence.
# `New_alias_identifier` and `Drug_manually_found_in_0drug_titles` are read
# but not used in the visible part of the script.
# NOTE(review): with sep="" each drug-list item is presumably one whole line
# (one drug name) - confirm against utils.File_Reader.
drug_file = FR("../DRUG_LISTS/original_drug_list_2606.txt", sep="", suppress_newlines=True, encoding="utf-8") modifications_file = FR( "../DRUG_LISTS/old_pubmed_with_drugs.latest.manually.annotated.txt", sep="\t", suppress_newlines=True, encoding="utf-8", strip_chars_pattern="^\"|\"$", skiplines=1) drugs = [] for line in drug_file.iter(): drugs.append(line) for line in modifications_file.iter(): current_Drugs = line[3].split("|") New_alias_identifier = line[5] Drug_manually_found_in_0drug_titles = line[6] Alias_to_delete = line[7] while '' in current_Drugs: current_Drugs.pop(current_Drugs.index('')) if Alias_to_delete in drugs and Alias_to_delete: drugs.pop(drugs.index(Alias_to_delete)) else: