Beispiel #1
0
"""
Count drug occurrences from input file. Format of said file should be checked.
"""

from utils import File_Reader as FR
from utils import File_Maker as FM

# Input table, read through the project's File_Reader with sep='\t'.
file = FR(
    "../PUBMED_DATA/pubmedNdrugs.latest.txt",
    sep='\t',
    suppress_newlines=True,
    skiplines=0,
    encoding="utf-8",
)

# Maps a drug name to the number of rows in which it was seen.
counter = {}

for line in file.iter():
    # Only the text before the first ';' of column 3 is counted.
    drug = line[3].partition(";")[0]
    if not drug:
        continue
    counter[drug] = counter.get(drug, 0) + 1

print(counter)
print(len(counter))

# Sort ascending by count, then flip the list so the largest counts come
# first (equal counts therefore appear in reverse insertion order).
s = sorted(counter.items(), key=lambda pair: pair[1])
s.reverse()
print(s)
Beispiel #2
0
from utils import File_Reader as FR
from utils import File_Maker as FM
from datetime import datetime

# All three inputs are opened with the same reader settings.
_TSV_OPTIONS = dict(
    sep="\t",
    suppress_newlines=True,
    encoding="utf-8",
    skiplines=0,
)

annotation_file = FR("../PUBMED_DATA/drugbank2606.latest.txt", **_TSV_OPTIONS)

fda_file = FR("../FDA/FDA_DRUG_DATABASE_cured_cleaned.latest.txt",
              **_TSV_OPTIONS)

drugbank_alias_file = FR(
    "../DRUGBANK/drugbank_extracted_identifiers.latest.txt", **_TSV_OPTIONS)

# The first row of the annotation file is taken off and kept as the header;
# `lines` holds the remaining rows.
lines = annotation_file.readlines()
annotation_header = lines.pop(0)

drug_dict = {}
    for item in node:
        yield (item, level)
        for child in find_rec(item, level + 1):
            yield child


if __name__ == '__main__':
    # Script entry point: read the drug list, then parse the DrugBank XML file.

    # Load files and data
    print("reading files")
    pmid = []
    drugs_data = []  # placeholder; rebound by readlines() below

    # Drug list read through the project's File_Reader with sep=';'.
    drug_data_file = FR("../DRUG_LISTS/full_drug_list.latest.txt",
                        sep=';',
                        suppress_newlines=True,
                        skiplines=0,
                        encoding="utf-8")

    drugs_data = drug_data_file.readlines()

    # Disabled variant that filled pmid / drugs_data row by row.
    # NOTE(review): its test `a!="NA" or a!=""` is always true; it would need
    # `and` to actually skip "NA"/empty values if it is ever re-enabled.
    # for line in drug_data_file.iter():
    # 	pmid.append(line[0])
    # 	a = line[1]
    # 	if a!="NA" or a!="":
    # 		drugs_data.append(a)

    # Parse xml file.
    print("parsing xml")
    # NOTE(review): no import of ET is visible in this excerpt -- presumably
    # xml.etree.ElementTree; confirm.
    tree = ET.parse('../DRUGBANK/drugbank_db.xml')
    root = tree.getroot()
Beispiel #4
0
"""
Outputs a condensed drug file from multiple .csv input files.
"""

from utils import File_Reader as FR

# Readfiles
# Regular expression handed to every reader below as strip_chars_pattern.
charstrip = "^|^\"| $|\"$"

# Reader settings shared by the drug-list exports opened below.
_DRUG_LIST_OPTIONS = dict(
    sep="\t",
    suppress_newlines=True,
    skiplines=1,
    strip_chars_pattern=charstrip,
    encoding="utf-16",
)

nosym_file = FR(
    "../DRUG_LISTS/tbl_Abst_drug_LIST_noSYMBOLS 2018 05 17.txt",
    **_DRUG_LIST_OPTIONS)
supp_file = FR(
    "../DRUG_LISTS/tbl_Abst_drug_LIST_SUPP 2018 05 31.txt",
    **_DRUG_LIST_OPTIONS)
dlist_file = FR(
    "../DRUG_LISTS/tbl_Abst_drug_vs_symbols_match - DRUG_LIST 2018 05 17.txt",
    **_DRUG_LIST_OPTIONS)
dmatch_file = FR(
    "../DRUG_LISTS/tbl_Abst_drug_vs_symbols_match - DRUG_MATCH 2018 05 27.txt",
    sep="\t",
    suppress_newlines=True,
'''
Cleaning of the FDA database.
Specifically, extra trailing spaces and tabs.
'''

from utils import File_Reader as FR
from utils import File_Maker as FM

# Pattern matching runs of spaces at the start or end of a value; passed to
# the reader as strip_chars_pattern.
charstrip = "^[ ]+|[ ]+$"
fda_file = FR(
    "../FDA/FDA_DRUG_DATABASE_cured.txt",
    encoding="utf-16",
    sep="\t",
    strip_chars_pattern=charstrip,
)

# Keep only the first five fields of every row.
fda = [row[:5] for row in fda_file.iter()]

print(fda)

# Hand the trimmed rows to the project's File_Maker and write them out.
file = FM(
    "../FDA/FDA_DRUG_DATABASE_cured_cleaned",
    data_stream=fda,
    extension=".txt",
)
file.save()

# with open("../FDA/FDA_DRUG_DATABASE_cured_cleaned.latest.txt", 'w', encoding = 'utf-8') as fp:
# 	for i in fda:
# 		fp.write("\t".join(i)+'\n')
"""
Merging of PubMed data with the drug list.
"""

import utils
from utils import File_Reader as FR
from utils import File_Maker as FM
from utils import Task_Follower as TF
import random
import re
from string import punctuation

# Pattern matching a leading or trailing double quote; passed to the PubMed
# reader as strip_chars_pattern.
strippattern = "^\"|\"$"
pubmed_file = FR(
    "../PUBMED_DATA/pubmed_data_2606.txt",
    sep="\t",
    suppress_newlines=True,
    skiplines=1,
    encoding="CP1252",
    strip_chars_pattern=strippattern,
)

drugs_file = FR(
    "../DRUG_LISTS/drug_list_2606_curated_cleaned.latest.txt",
    sep=";",
    suppress_newlines=True,
    encoding="utf-8",
)

pubmed = pubmed_file.readlines()
drugs = drugs_file.readlines()

# One placeholder entry per article, keyed by int(article[1]); every entry
# gets its own fresh list in third position.
match = {int(article[1]): ("", "", [], "") for article in pubmed}


Beispiel #7
0
def char_strip(string, pattern):
    """
    Remove every part of ``string`` matched by the regular expression ``pattern``.

    Returns the resulting string; ``string`` itself is left untouched.
    """
    stripper = re.compile(pattern)
    return stripper.sub('', string)


# Load files and data
print("reading files")
pmid = []        # column 0 of every row
drugs_data = []  # column 1 of the rows that hold a real value

drug_data_file = FR("../PUBMED_DATA/pubmedNdrugs_2.txt",
                    sep='\t',
                    suppress_newlines=True,
                    skiplines=0,
                    encoding="utf-16")

for line in drug_data_file.iter():
    pmid.append(line[0])
    a = line[1]
    # Skip placeholder values. The previous test `a != "NA" or a != ""` was
    # true for every possible value, so "NA" and empty entries were never
    # filtered out.
    if a not in ("NA", ""):
        drugs_data.append(a)

# Parse xml file.
print("parsing xml")
tree = ET.parse('../DRUGBANK/drugbank_db.xml')
root = tree.getroot()

# a = [elem for elem in root.findall(get_xpath("by_PMID", keyword = "26242220"))]
Beispiel #8
0
import utils
from utils import File_Reader as FR
from utils import Task_Follower as TF

# Pattern matching a leading or trailing double quote; passed to the PubMed
# reader as strip_chars_pattern.
strippattern = "^\"|\"$"
pubmed_file = FR(
    "../PUBMED_DATA/pubmed_data_2606.txt",
    sep="\t",
    suppress_newlines=True,
    skiplines=1,
    encoding="CP1252",
    strip_chars_pattern=strippattern,
)

drugs_file = FR(
    "../DRUG_LISTS/full_drug_list.txt",
    sep=";",
    suppress_newlines=True,
    encoding="utf-16",
)

pubmed = pubmed_file.readlines()
drugs = drugs_file.readlines()

utils.head(pubmed, stop=1)

# One empty list per article, keyed by int(article[1]).
match = {int(article[1]): [] for article in pubmed}

print(len(match))

# Task_Follower is constructed with the number of drug rows.
tf = TF(len(drugs))
Beispiel #9
0
from utils import File_Reader as FR
from utils import File_Maker as FM
from datetime import datetime

drugs_file = FR(
    "../PUBMED_DATA/drugbank2606.latest.txt",
    sep="\t",
    suppress_newlines=True,
    encoding="utf-8",
    skiplines=0,
)

# The same FDA file is opened twice with different reader settings.
# NOTE(review): sep="" on this first reader looks unusual -- confirm that
# File_Reader accepts an empty separator.
fda_file = FR(
    "../FDA/FDA_DATABASE_2018_07.txt",
    sep="",
    suppress_newlines=True,
    encoding="CP1252",
    skiplines=1,
)

strippattern = "^\"|\"$|^ +| +$"
fda_file2 = FR(
    "../FDA/FDA_DATABASE_2018_07.txt",
    sep="\t",
    suppress_newlines=True,
    encoding="CP1252",
    skiplines=0,
    strip_chars_pattern=strippattern,
)

fda_lines = fda_file.readlines()
fda_dict = fda_file2.as_dict(lines_askeys=True)
header, drugs_dict = drugs_file.as_dict(ret_header=True)

# Extend the drug header with a flag column plus one "FDA_"-prefixed column
# per retained FDA field.
fda_cols_retained = [
    "SubmissionStatusDate",
    "SubmissionStatus",
    "SponsorName",
    "ActiveIngredient",
]
app = "FDA_"
header.append("HAS_FDA_ENTRY")
header.extend(app + col for col in fda_cols_retained)
for key in drugs_dict.keys():
	# alias = ";".join([drugs_dict[key]["COMMON_DRUGBANK_ALIAS"],
	# 	drugs_dict[key]["MINED_ALIAS"]]) if drugs_dict[key]["MINED_ALIAS"] else drugs_dict[key]["COMMON_DRUGBANK_ALIAS"]
from utils import File_Reader as FR
from utils import File_Maker as FM

annotation_file = FR(
    "../DRUG_LISTS/drug_pivot_0_1.clean.latest.txt",
    sep="\t",
    suppress_newlines=True,
    encoding="utf-8",
)
# NOTE(review): sep="" looks unusual -- confirm that File_Reader accepts an
# empty separator.
drugs_file = FR(
    "../DRUG_LISTS/drug_list_2606_curated_cleaned.latest.txt",
    sep="",
    suppress_newlines=True,
    encoding="utf-8",
)

old_drugs = []
annotations = annotation_file.readlines()

# First row is the header; the remaining rows are the annotations.
header = annotations.pop(0)

# Maps the first field of each row to a {column name: value} dict; columns
# missing from a short row are filled with "".
old_drug_dict = {}
for line in annotations:
    record = {}
    for i, column in enumerate(header):
        record[column] = line[i] if i < len(line) else ""
    old_drug_dict[line[0]] = record

print(header)

# First field of every annotation row, in file order.
old_drugs.extend(row[0] for row in annotations)
Beispiel #11
0
        if match:
            res[index] = True

    return (res)


def char_strip(string, pattern):
    """Return ``string`` with every match of the regex ``pattern`` removed."""
    compiled = re.compile(pattern)
    return compiled.sub('', string)


print("reading files")
pmid, drugs_data = [], []

file = FR(
    "../PUBMED_DATA/pubmedNdrugs.txt",
    sep='\t',
    suppress_newlines=True,
    skiplines=1,
)

for line in file.iter():
    # NOTE(review): the pattern ^""$ only matches a field that consists of
    # exactly two double quotes -- confirm this is intended rather than
    # stripping quotes that surround a value.
    pmid.append(char_strip(line[1], "^\"\"$"))
    # Every double quote is removed from column 2 before the "NA" test.
    a = char_strip(line[2], "\"")
    if a == "NA":
        continue
    drugs_data.append(a)

print("parsing xml")
tree = ET.parse('../DRUGBANK/drugbank_db.xml')
root = tree.getroot()

# a = [elem for elem in root.findall(get_xpath("by_PMID", keyword = "26242220"))]
# a = [elem for elem in root.findall("./drug[name='Lepirudin']")]
Beispiel #12
0
from utils import File_Reader as FR
from utils import File_Maker as FM
from utils import Task_Follower as TF
import cirpy
import pubchempy as pcp
from datetime import datetime

pivot_file = FR(
    "../PUBMED_DATA/drugs2606minedalias_with_found_identifiers.latest.txt",
    sep="\t",
    suppress_newlines=True,
    encoding="utf-8",
)

lines = pivot_file.readlines()
# Row count is reported before the header row is taken off.
print(len(lines))

# Detach the first row as the header; `lines` keeps the remaining rows.
header = lines[0]
del lines[0]


def cirpy_getter(drug):
    """
    Call ``cirpy.resolve(drug, 'names')`` and return the result joined with
    ';', or an empty string when the result is falsy.
    """
    names = cirpy.resolve(drug, 'names')
    return ";".join(names) if names else ""


def pcp_getter(drug):
    """
    Call ``pcp.get_synonyms(drug, 'name')`` and return the "Synonym" entries
    of the first result joined with ';'.

    As visible here, a falsy lookup result falls through and the function
    returns None implicitly.
    NOTE(review): this excerpt ends right after the ``if`` branch, so the
    function may continue beyond what is shown -- confirm.
    """
    p = pcp.get_synonyms(drug, 'name')
    if p:
        return ";".join(p[0]["Synonym"])
Beispiel #13
0
from utils import File_Reader as FR
from utils import File_Maker as FM
from utils import head
import re
from datetime import datetime

annotation_file = FR(
    "../DRUG_LISTS/drug_pivot_2606.latest.txt",
    sep="\t",
    suppress_newlines=True,
    encoding="utf-8",
)

annotations = annotation_file.readlines()

# The first row is the header; it is extended in place with the names of the
# additional columns.
header = annotations.pop(0)
header.extend([
    "DRUGBANK_SYNONYS_AND_PRODUCTS",
    "DRUGBANK_ID",
    "CAS_NUMBER",
    "UNII",
    "ASSOCIATED_PMID",
    "OLDEST_PMID",
    "OLDEST_DATE_OF_PUBLICATION",
])

print(header)

# Output rows, starting with the extended header.
pool_data = [header]

drug_dict = {}
for line in annotations:
    drug_dict[line[0]] = {}
    for i in range(len(header)):
Beispiel #14
0
from utils import File_Reader as FR
from utils import File_Maker as FM
from utils import head
import re

# NOTE(review): sep="" looks unusual -- confirm that File_Reader accepts an
# empty separator.
drug_file = FR(
    "../DRUG_LISTS/original_drug_list_2606.txt",
    sep="",
    suppress_newlines=True,
    encoding="utf-8",
)

modifications_file = FR(
    "../DRUG_LISTS/old_pubmed_with_drugs.latest.manually.annotated.txt",
    sep="\t",
    suppress_newlines=True,
    encoding="utf-8",
    strip_chars_pattern="^\"|\"$",
    skiplines=1,
)

# Every row of the drug file, in the order the reader yields them.
drugs = list(drug_file.iter())

for line in modifications_file.iter():

    current_Drugs = line[3].split("|")
    New_alias_identifier = line[5]
    Drug_manually_found_in_0drug_titles = line[6]
    Alias_to_delete = line[7]

    while '' in current_Drugs:
from utils import File_Reader as FR
drugbank_alias_file = FR(
    "../PUBMED_DATA/drugbank2606.latest.txt",
    sep="\t",
    suppress_newlines=True,
    encoding="utf-8",
    skiplines=0,
)

drugbank_dict = drugbank_alias_file.as_dict()

# Print the dictionary keys in sorted order, one per line.
drugbank_names = sorted(drugbank_dict.keys())

for k in drugbank_names:
    print(k)