Ejemplo n.º 1
0
"""
Count drug occurrences from input file. Format of said file should be checked.
"""

from utils import File_Reader as FR
from utils import File_Maker as FM

# Open the tab-separated PubMed/drug table; rows are indexed by column below.
file = FR(
    "../PUBMED_DATA/pubmedNdrugs.latest.txt",
    sep='\t',
    suppress_newlines=True,
    skiplines=0,
    encoding="utf-8",
)

# Maps drug name -> number of rows it appears in (first-seen key order).
counter = {}

for line in file.iter():
    # Column 3 is a ';'-separated field; only its first entry is counted.
    drug = line[3].split(";")[0]
    if not drug:
        # Rows with an empty first entry contribute nothing.
        continue
    counter[drug] = counter.get(drug, 0) + 1

print(counter)

print(len(counter))

# Sort ascending by count, then flip in place so the largest counts come first.
s = sorted(counter.items(), key=lambda pair: pair[1])
s.reverse()

print(s)
Ejemplo n.º 2
0
'''
Cleaning of the fda database.
Specifically extra trailing spaces and tabs.
'''

from utils import File_Reader as FR
from utils import File_Maker as FM

# Pattern passed to the reader as strip_chars_pattern: runs of spaces at
# the start or the end of a value.
charstrip = "^[ ]+|[ ]+$"
fda_file = FR(
    "../FDA/FDA_DRUG_DATABASE_cured.txt",
    encoding="utf-16",
    sep="\t",
    strip_chars_pattern=charstrip,
)

# Keep only the first five fields of every record read from the file.
fda = [record[:5] for record in fda_file.iter()]

print(fda)

# Hand the trimmed records to the file maker and write them out.
file = FM(
    "../FDA/FDA_DRUG_DATABASE_cured_cleaned",
    data_stream=fda,
    extension=".txt",
)
file.save()

# with open("../FDA/FDA_DRUG_DATABASE_cured_cleaned.latest.txt", 'w', encoding = 'utf-8') as fp:
# 	for i in fda:
# 		fp.write("\t".join(i)+'\n')
Ejemplo n.º 3
0
# Search drugbank for alternative aliases from CIR and Pubchem

# Tab-separated table of candidate aliases; the loop further down reads
# column 0 as the key and columns 1 and 2 as ';'-separated alias lists.
alternative_alias_file = FR(
    "../DRUG_LISTS/pcp_cir_newalias.latest.txt",
    sep="\t",
    suppress_newlines=True,
    encoding="utf-8",
)

# DrugBank identifier dump, read fully into memory.
# NOTE(review): skiplines=1 presumably drops a header row - confirm against FR.
drugbank_db_file = FR(
    "../DRUGBANK/drugbank_extracted_identifiers.latest.txt",
    skiplines=1,
    suppress_newlines=True,
    encoding="utf-8",
)
drugbank = drugbank_db_file.readlines()

count = 0
for new in alternative_alias_file.iter():
    key = new[0]

    # print(key)

    if new[1] or new[2]:
        a = ''
        if new[1]:
            a = new[1].lower().split(";")
        b = ''
        if new[2]:
            b = new[2].lower().split(";")
        aliases = []
        if a:
            aliases.extend(a)
        if b:
Ejemplo n.º 4
0
	"""
    return re.sub(pattern, '', string)


# Load files and data
print("reading files")
pmid = []        # column 0 of every row
drugs_data = []  # column 1 of the rows that carry a real drug value

drug_data_file = FR("../PUBMED_DATA/pubmedNdrugs_2.txt",
                    sep='\t',
                    suppress_newlines=True,
                    skiplines=0,
                    encoding="utf-16")

for line in drug_data_file.iter():
    pmid.append(line[0])
    a = line[1]
    # Keep only real drug values. The earlier test
    # `a != "NA" or a != ""` was true for every possible value, so the
    # "NA" placeholder and empty strings were never filtered out.
    # NOTE(review): pmid still gets one entry per row, so pmid and
    # drugs_data are not index-aligned once rows are skipped.
    if a not in ("NA", ""):
        drugs_data.append(a)

# Parse xml file.
print("parsing xml")
tree = ET.parse('../DRUGBANK/drugbank_db.xml')
root = tree.getroot()

# Example queries kept for reference:
# a = [elem for elem in root.findall(get_xpath("by_PMID", keyword = "26242220"))]
# a = [elem for elem in root.findall("./drug[name='Lepirudin']")]

res = []
print("search")
Ejemplo n.º 5
0
# Original drug list; every item produced by the reader becomes one entry
# of `drugs`.
drug_file = FR(
    "../DRUG_LISTS/original_drug_list_2606.txt",
    sep="",
    suppress_newlines=True,
    encoding="utf-8",
)

# Manually annotated table: tab-separated, one line skipped at the top, and
# a strip pattern matching a double quote at either end of a value.
modifications_file = FR(
    "../DRUG_LISTS/old_pubmed_with_drugs.latest.manually.annotated.txt",
    sep="\t",
    suppress_newlines=True,
    encoding="utf-8",
    strip_chars_pattern="^\"|\"$",
    skiplines=1,
)

# Materialise the drug list so it can be searched and edited below.
drugs = list(drug_file.iter())

for line in modifications_file.iter():

    current_Drugs = line[3].split("|")
    New_alias_identifier = line[5]
    Drug_manually_found_in_0drug_titles = line[6]
    Alias_to_delete = line[7]

    while '' in current_Drugs:
        current_Drugs.pop(current_Drugs.index(''))

    if Alias_to_delete in drugs and Alias_to_delete:
        drugs.pop(drugs.index(Alias_to_delete))
    else: