"""Count drug occurrences from the input file.

The format of said file should be checked.
"""
from utils import File_Reader as FR
from utils import File_Maker as FM

file = FR(
    "../PUBMED_DATA/pubmedNdrugs.latest.txt",
    sep='\t',
    suppress_newlines=True,
    skiplines=0,
    encoding="utf-8",
)

# Tally how many rows mention each drug.
counter = {}
for line in file.iter():
    # Only the first ";"-separated entry of the fourth field is counted.
    drug = line[3].split(";")[0]
    if not drug:
        continue
    counter[drug] = counter.get(drug, 0) + 1

print(counter)
print(len(counter))

# Ascending sort by count followed by a reversal, so the highest counts
# come first (the reversal also reverses the order of tied entries).
s = sorted(counter.items(), key=lambda x: x[1])
s.reverse()
print(s)
from utils import File_Reader as FR
from utils import File_Maker as FM
from datetime import datetime

# Every input below is opened with exactly the same reader options.
_reader_options = {
    "sep": "\t",
    "suppress_newlines": True,
    "encoding": "utf-8",
    "skiplines": 0,
}

annotation_file = FR("../PUBMED_DATA/drugbank2606.latest.txt", **_reader_options)
fda_file = FR(
    "../FDA/FDA_DRUG_DATABASE_cured_cleaned.latest.txt",
    **_reader_options
)
drugbank_alias_file = FR(
    "../DRUGBANK/drugbank_extracted_identifiers.latest.txt",
    **_reader_options
)

# Separate the first row of the annotation file from the remaining rows.
lines = annotation_file.readlines()
annotation_header = lines.pop(0)
# print(lines[0])

drug_dict = {}
for item in node: yield (item, level) for child in find_rec(item, level + 1): yield child if __name__ == '__main__': # Load files and data print("reading files") pmid = [] drugs_data = [] drug_data_file = FR("../DRUG_LISTS/full_drug_list.latest.txt", sep=';', suppress_newlines=True, skiplines=0, encoding="utf-8") drugs_data = drug_data_file.readlines() # for line in drug_data_file.iter(): # pmid.append(line[0]) # a = line[1] # if a!="NA" or a!="": # drugs_data.append(a) # Parse xml file. print("parsing xml") tree = ET.parse('../DRUGBANK/drugbank_db.xml') root = tree.getroot()
"""Output a condensed drug file from multiple .csv file inputs."""
from utils import File_Reader as FR

# Read files.
# Regular expression handed to every reader below as strip_chars_pattern.
# Its alternatives are: start of string, a leading double quote, a trailing
# space and a trailing double quote.
# NOTE(review): the first alternative is a bare "^", which only matches the
# empty string; a leading-space alternative mirroring " $" may have been
# intended -- confirm against File_Reader.
charstrip = "^|^\"| $|\"$"
# All readers use a tab separator, skiplines=1 and UTF-16 encoding.
nosym_file = FR("../DRUG_LISTS/tbl_Abst_drug_LIST_noSYMBOLS 2018 05 17.txt",
                sep="\t", suppress_newlines=True, skiplines=1,
                strip_chars_pattern=charstrip, encoding="utf-16")
supp_file = FR("../DRUG_LISTS/tbl_Abst_drug_LIST_SUPP 2018 05 31.txt",
               sep="\t", suppress_newlines=True, skiplines=1,
               strip_chars_pattern=charstrip, encoding="utf-16")
dlist_file = FR(
    "../DRUG_LISTS/tbl_Abst_drug_vs_symbols_match - DRUG_LIST 2018 05 17.txt",
    sep="\t", suppress_newlines=True, skiplines=1,
    strip_chars_pattern=charstrip, encoding="utf-16")
# NOTE(review): the argument list of this call continues beyond the visible
# part of the file.
dmatch_file = FR(
    "../DRUG_LISTS/tbl_Abst_drug_vs_symbols_match - DRUG_MATCH 2018 05 27.txt",
    sep="\t", suppress_newlines=True,
'''
Clean the FDA database: specifically, remove extra trailing spaces and tabs.
'''
from utils import File_Reader as FR
from utils import File_Maker as FM

# Pattern given to the reader as strip_chars_pattern: runs of spaces at
# either end of a string.
charstrip = "^[ ]+|[ ]+$"
fda_file = FR(
    "../FDA/FDA_DRUG_DATABASE_cured.txt",
    encoding="utf-16",
    sep="\t",
    strip_chars_pattern=charstrip,
)

# Keep only the first five fields of every row.
fda = [f[:5] for f in fda_file.iter()]
print(fda)

file = FM(
    "../FDA/FDA_DRUG_DATABASE_cured_cleaned",
    data_stream=fda,
    extension=".txt",
)
file.save()

# Earlier manual writer, kept for reference:
# with open("../FDA/FDA_DRUG_DATABASE_cured_cleaned.latest.txt", 'w', encoding = 'utf-8') as fp:
#     for i in fda:
#         fp.write("\t".join(i)+'\n')
"""Bring together the PubMed data and the list of drugs."""
import utils
from utils import File_Reader as FR
from utils import File_Maker as FM
from utils import Task_Follower as TF
import random
import re
from string import punctuation

# Pattern given to the PubMed reader as strip_chars_pattern: a double quote
# at the start or at the end of a string.
strippattern = "^\"|\"$"

pubmed_file = FR(
    "../PUBMED_DATA/pubmed_data_2606.txt",
    sep="\t",
    suppress_newlines=True,
    skiplines=1,
    encoding="CP1252",
    strip_chars_pattern=strippattern,
)
drugs_file = FR(
    "../DRUG_LISTS/drug_list_2606_curated_cleaned.latest.txt",
    sep=";",
    suppress_newlines=True,
    encoding="utf-8",
)

pubmed = pubmed_file.readlines()
drugs = drugs_file.readlines()

# One placeholder entry per article, keyed by the integer value of the
# article's second field; every entry gets its own fresh list.
match = {}
for article in pubmed:
    match[int(article[1])] = ("", "", [], "")
def char_strip(string, pattern):
    """Return ``string`` with every match of the regex ``pattern`` removed.

    Args:
        string: Text to clean.
        pattern: Regular expression whose matches are deleted.

    Returns:
        The result of substituting the empty string for each match.
    """
    return re.sub(pattern, '', string)


# Load files and data
print("reading files")
pmid = []
drugs_data = []
drug_data_file = FR(
    "../PUBMED_DATA/pubmedNdrugs_2.txt",
    sep='\t',
    suppress_newlines=True,
    skiplines=0,
    encoding="utf-16",
)
for line in drug_data_file.iter():
    pmid.append(line[0])
    a = line[1]
    # The previous test, `a != "NA" or a != ""`, was true for every value
    # and therefore filtered nothing; skip both placeholder values.
    if a not in ("NA", ""):
        drugs_data.append(a)

# Parse xml file.
print("parsing xml")
tree = ET.parse('../DRUGBANK/drugbank_db.xml')
root = tree.getroot()
# a = [elem for elem in root.findall(get_xpath("by_PMID", keyword = "26242220"))]
import utils
from utils import File_Reader as FR
from utils import Task_Follower as TF

# Pattern given to the PubMed reader as strip_chars_pattern: a double quote
# at the start or at the end of a string.
strippattern = "^\"|\"$"

pubmed_file = FR(
    "../PUBMED_DATA/pubmed_data_2606.txt",
    sep="\t",
    suppress_newlines=True,
    skiplines=1,
    encoding="CP1252",
    strip_chars_pattern=strippattern,
)
drugs_file = FR(
    "../DRUG_LISTS/full_drug_list.txt",
    sep=";",
    suppress_newlines=True,
    encoding="utf-16",
)

pubmed = pubmed_file.readlines()
drugs = drugs_file.readlines()
utils.head(pubmed, stop=1)

# One empty list per article, keyed by the integer value of the article's
# second field.
match = {}
for article in pubmed:
    match[int(article[1])] = []
print(len(match))

tf = TF(len(drugs))
from utils import File_Reader as FR
from utils import File_Maker as FM
from datetime import datetime

# Drug table whose rows are extended with FDA columns below.
drugs_file = FR("../PUBMED_DATA/drugbank2606.latest.txt",
    sep = "\t", suppress_newlines = True, encoding = "utf-8", skiplines = 0)
# drugs_file = FR("../PUBMED_DATA/drugbank2606.latest.txt",
#     sep = "\t", suppress_newlines = True, encoding = "utf-8", skiplines = 0)

# The same FDA file is opened twice: once with an empty separator and
# skiplines = 1, once tab-separated with a strip pattern and skiplines = 0.
# NOTE(review): an empty separator presumably leaves each line unsplit --
# confirm against File_Reader.
fda_file = FR("../FDA/FDA_DATABASE_2018_07.txt",
    sep = "", suppress_newlines = True, encoding = "CP1252", skiplines = 1)
# Strip pattern: a double quote or a run of spaces at either end of a string.
strippattern = "^\"|\"$|^ +| +$"
fda_file2 = FR("../FDA/FDA_DATABASE_2018_07.txt",
    sep = "\t", suppress_newlines = True, encoding = "CP1252", skiplines = 0,
    strip_chars_pattern = strippattern)
# fda_dict = fda_file.as_dict(lines_askeys = True)
fda_lines = fda_file.readlines()
fda_dict = fda_file2.as_dict(lines_askeys = True)
header,drugs_dict = drugs_file.as_dict(ret_header = True)

# FDA columns appended to the header, each prefixed with "FDA_", after a
# HAS_FDA_ENTRY column.
fda_cols_retained = ["SubmissionStatusDate", "SubmissionStatus",
    "SponsorName", "ActiveIngredient"]
app = "FDA_"
header.append("HAS_FDA_ENTRY")
for col in fda_cols_retained:
    header.append(app+col)

# NOTE(review): the body of this loop continues beyond the visible part of
# the file; only its commented-out first lines can be seen.
for key in drugs_dict.keys():
    # alias = ";".join([drugs_dict[key]["COMMON_DRUGBANK_ALIAS"],
    # drugs_dict[key]["MINED_ALIAS"]]) if drugs_dict[key]["MINED_ALIAS"] else drugs_dict[key]["COMMON_DRUGBANK_ALIAS"]
from utils import File_Reader as FR
from utils import File_Maker as FM

annotation_file = FR(
    "../DRUG_LISTS/drug_pivot_0_1.clean.latest.txt",
    sep="\t",
    suppress_newlines=True,
    encoding="utf-8",
)
drugs_file = FR(
    "../DRUG_LISTS/drug_list_2606_curated_cleaned.latest.txt",
    sep="",
    suppress_newlines=True,
    encoding="utf-8",
)
old_drugs = []

# The first row of the annotation file holds the column names.
annotations = annotation_file.readlines()
header = annotations.pop(0)

# Map each row's first field to a {column name: value} dictionary; columns
# missing from a short row are filled with the empty string.
old_drug_dict = {}
for line in annotations:
    old_drug_dict[line[0]] = {}
    for i, column in enumerate(header):
        val = line[i] if i < len(line) else ""
        old_drug_dict[line[0]][column] = val
print(header)

# Collect the first field of every annotation row, in file order.
old_drugs.extend(row[0] for row in annotations)
if match: res[index] = True return (res) def char_strip(string, pattern): return re.sub(pattern, '', string) print("reading files") pmid = [] drugs_data = [] file = FR("../PUBMED_DATA/pubmedNdrugs.txt", sep='\t', suppress_newlines=True, skiplines=1) for line in file.iter(): pmid.append(char_strip(line[1], "^\"\"$")) a = char_strip(line[2], "\"") if a != "NA": drugs_data.append(a) print("parsing xml") tree = ET.parse('../DRUGBANK/drugbank_db.xml') root = tree.getroot() # a = [elem for elem in root.findall(get_xpath("by_PMID", keyword = "26242220"))] # a = [elem for elem in root.findall("./drug[name='Lepirudin']")]
from utils import File_Reader as FR
from utils import File_Maker as FM
from utils import Task_Follower as TF
import cirpy
import pubchempy as pcp
from datetime import datetime

pivot_file = FR(
    "../PUBMED_DATA/drugs2606minedalias_with_found_identifiers.latest.txt",
    sep="\t", suppress_newlines=True, encoding="utf-8")
lines = pivot_file.readlines()
print(len(lines))
# The first row is removed from ``lines`` and kept as the header.
header = lines.pop(0)


def cirpy_getter(drug):
    """Return the names ``cirpy.resolve`` finds for ``drug``, joined by ";".

    Returns the empty string when the lookup result is falsy.
    """
    c = cirpy.resolve(drug, 'names')
    if c:
        return ";".join(c)
    else:
        return ""


def pcp_getter(drug):
    """Return the "Synonym" entries of the first PubChem result for ``drug``,
    joined by ";".

    NOTE(review): the function may continue beyond the visible part of the
    file; what happens when the lookup result is falsy cannot be seen here.
    """
    p = pcp.get_synonyms(drug, 'name')
    if p:
        return ";".join(p[0]["Synonym"])
from utils import File_Reader as FR
from utils import File_Maker as FM
from utils import head
import re
from datetime import datetime

annotation_file = FR("../DRUG_LISTS/drug_pivot_2606.latest.txt",
                     sep="\t", suppress_newlines=True, encoding="utf-8")
annotations = annotation_file.readlines()
# The first row is removed from ``annotations`` and extended with the extra
# column names below.
header = annotations.pop(0)
header.append("DRUGBANK_SYNONYS_AND_PRODUCTS")
header.append("DRUGBANK_ID")
header.append("CAS_NUMBER")
header.append("UNII")
header.append("ASSOCIATED_PMID")
header.append("OLDEST_PMID")
header.append("OLDEST_DATE_OF_PUBLICATION")
print(header)

# ``pool_data`` starts with the extended header as its first element.
pool_data = []
pool_data.append(header)

# One dictionary per annotation row, keyed by the row's first field.
drug_dict = {}
for line in annotations:
    drug_dict[line[0]] = {}
    # NOTE(review): the body of this inner loop continues beyond the visible
    # part of the file.
    for i in range(len(header)):
from utils import File_Reader as FR
from utils import File_Maker as FM
from utils import head
import re

# NOTE(review): an empty separator presumably leaves each line unsplit --
# confirm against File_Reader.
drug_file = FR("../DRUG_LISTS/original_drug_list_2606.txt",
               sep="", suppress_newlines=True, encoding="utf-8")
modifications_file = FR(
    "../DRUG_LISTS/old_pubmed_with_drugs.latest.manually.annotated.txt",
    sep="\t", suppress_newlines=True, encoding="utf-8",
    strip_chars_pattern="^\"|\"$", skiplines=1)

# Collect every item yielded by the drug-list reader.
drugs = []
for line in drug_file.iter():
    drugs.append(line)

for line in modifications_file.iter():
    # The fourth field holds "|"-separated drug names; fields six to eight
    # are read into the variables named below.
    current_Drugs = line[3].split("|")
    New_alias_identifier = line[5]
    Drug_manually_found_in_0drug_titles = line[6]
    Alias_to_delete = line[7]
    # NOTE(review): the body of this loop continues beyond the visible part
    # of the file.
    while '' in current_Drugs:
from utils import File_Reader as FR

drugbank_alias_file = FR(
    "../PUBMED_DATA/drugbank2606.latest.txt",
    sep="\t",
    suppress_newlines=True,
    encoding="utf-8",
    skiplines=0,
)
drugbank_dict = drugbank_alias_file.as_dict()

# Print every key of the dictionary, one per line, in sorted order.
drugbank_names = sorted(drugbank_dict.keys())
for k in drugbank_names:
    print(k)