Ejemplo n.º 1
0
    def search(self, *args, **kwds):
        """Yield one list of keyword hits for every path under ``self.dirname``.

        ``**kwds`` is forwarded unchanged to ``Folder.listdir``; positional
        ``*args`` are accepted but not used.  A path whose lower-cased name
        ends in ``.pdf`` is handed to ``pdf2txt`` and the value that call
        returns is used as the path from then on.

        Each yielded list contains one dict per (keyword, match) pair for
        which ``keyword in match`` holds, with the keys ``keyword``,
        ``match`` and ``path``.  A list is yielded for every path, even when
        it turns out to be empty.
        """
        for entry in Folder.listdir(self.dirname, **kwds):
            is_pdf = entry.lower().endswith('.pdf')
            target = pdf2txt(entry) if is_pdf else entry

            hits = []
            for keyword in self.keywords:
                # get_matches is queried afresh for every keyword.
                for found in self.get_matches(target):
                    if keyword in found:
                        hits.append({
                            'keyword': keyword,
                            'match': found,
                            'path': target,
                        })
            yield hits
Ejemplo n.º 2
0
def _merge_aliases(primary_id, candidates):
    """Return ``[primary_id]`` extended with every new, non-placeholder candidate.

    Candidates equal to ``'-'`` and candidates already collected are skipped;
    first-seen order is preserved.
    """
    aliases = [primary_id]
    for name in candidates:
        if name != '-' and name not in aliases:
            aliases.append(name)
    return aliases


def main(generator=False):
    """Fetch the current BioGRID release and write ``interactions.txt``.

    Steps:
    1. Scrape the release version from the BioGRID homepage.
    2. Download the ``tab2`` and ``mitab`` archives of that release via
       ``Folder`` and parse both into sequences of lines.
    3. Write one tab-separated row per tab2 data line to
       ``<path>/interactions.txt``.
    4. Append ``"<version> <timestamp>"`` to ``<path>/version.txt``.
    5. Delete every entry of ``path`` whose name contains ``"BIOGRID"``.

    ``generator`` is accepted for interface compatibility but is not used.

    Raises:
        RuntimeError: if no version marker is found on the homepage
            (previously this surfaced as an unbound-variable error).
    """
    os.chdir(path)

    # Version: read the homepage, always releasing the connection.
    url = 'http://thebiogrid.org/'
    f = urllib.urlopen(url)
    try:
        contents = f.read()
    finally:
        f.close()

    # The full marker (with trailing space) is used for both the test and the
    # split, so a matching line can always be split.
    marker = '<div class="newspost-title">BioGRID Version '
    VERSION = None
    for line in contents.split('\n'):
        if marker in line:
            print(line)
            VERSION = line.split(marker)[1].split(' Release ')[0]
            break
    if VERSION is None:
        raise RuntimeError('Could not find the BioGRID version on %s' % url)
    print(VERSION)

    # Urls:
    url = 'http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-'
    tab2url = url + VERSION + '/BIOGRID-ALL-' + VERSION + '.tab2.zip'
    mitaburl = url + VERSION + '/BIOGRID-ALL-' + VERSION + '.mitab.zip'

    # Files:
    folder = Folder()
    folder.get([tab2url, mitaburl])

    tab2 = folder.contains('tab2')[0].parse(printing=False, seperator=None)
    mitab = folder.contains('mitab')[0].parse(printing=False, seperator=None)

    # Parsing:
    # NOTE(review): D is filled but never returned or read in this function;
    # it is kept so the integer validation of columns 0 and 1 still happens.
    D = {}
    with open(os.path.join(path, 'interactions.txt'), 'w') as output:
        for x, line in enumerate(tab2):
            # Skip the header line and blank lines.
            if "#BioGRID Interaction ID" in line or line == "":
                continue

            columns = line.split('\t')
            interaction_id = int(columns[0])

            # Columns 1/2 seed the alias lists; 5/6 are the systematic names,
            # 7/8 the official gene symbols and 9/10 the '|'-separated synonyms.
            alias_a = _merge_aliases(
                columns[1], [columns[5], columns[7]] + columns[9].split('|'))
            alias_b = _merge_aliases(
                columns[2], [columns[6], columns[8]] + columns[10].split('|'))

            experimental_system_type = [columns[12], 'direct']
            experimental_system = columns[11]

            # Text between the first '(' and the following ')' of mitab field 11.
            # Assumes mitab line x describes the same interaction as tab2
            # line x -- TODO confirm.
            interaction_type = mitab[x].split('\t')[11].split('(')[1].split(')')[0]

            pmid = int(columns[14])
            # NOTE(review): 559292 -> 4932 presumably folds a strain-level
            # taxid into its species-level taxid -- confirm.
            taxid_a = int(columns[15])
            if taxid_a == 559292:
                taxid_a = 4932
            taxid_b = int(columns[16])
            if taxid_b == 559292:
                taxid_b = 4932
            throughput = columns[17].split('|')
            modification = columns[19] if columns[19] != '-' else ''
            source = columns[23]  # 'BioGRID'

            output.write('\t'.join([
                '; '.join(alias_a),
                '; '.join(alias_b),
                '; '.join(experimental_system_type),
                experimental_system,
                interaction_type,
                modification,
                str(taxid_a),
                str(taxid_b),
                str(pmid),
                source + '\n',
            ]))

            D.setdefault(interaction_id, {})[int(columns[1])] = {
                'source': source,
                'experimental_system': experimental_system,
                'experimental_system_type': experimental_system_type,
                'taxid_a': taxid_a,
                'taxid_b': taxid_b,
                'pmid': pmid,
                'throughput': throughput,
            }

    new_entry = '\n%s %s' % (VERSION, datetime.datetime.now())
    with open(os.path.join(path, 'version.txt'), 'a') as version_file:
        version_file.write(new_entry)

    # Cleaning up: remove the downloaded archives / extracted files.
    for filename in os.listdir(path):
        if "BIOGRID" in filename:
            print("Deleting %s" % filename)
            # Join with the directory that was listed instead of relying on
            # the current working directory.
            os.remove(os.path.join(path, filename))
Ejemplo n.º 3
0
def main(interactions=False,
         download=True,
         parse=True,
         withdrawn=True,
         cleanup=True):
    """Download MGI report files and build the gene annotation and mappings.

    Every stage is switched by its own flag:

    - ``interactions``: also fetch the protein-interaction directory via FTP.
    - ``download``: fetch and parse each listed ``.rpt`` report and record
      its name in ``folder.downloads``.
    - ``parse``: load the reports into ``genes`` using taxid 10090.
    - ``withdrawn``: read ``MRK_List1.rpt`` when true, ``MRK_List2.rpt``
      otherwise.
    - ``cleanup``: remove the FTP download (only when ``interactions`` is
      set) and every entry of ``folder.downloads``.

    Independently of the flags, ``genes`` is finally filtered with
    ``keep``/``remove``, saved, and its mappings are rebuilt.

    TODO:
    - Inspect and eventually use the interaction file, else drop it from
      this module.
    - Check whether other MGI information, such as homology or phenotypes,
      is worth integrating.
    """
    os.chdir(path)
    genes.name = 'MGI'
    genes.key = 'mgi'
    folder = Folder(path)

    if interactions:
        ftp = FTP(
            url='ftp://ftp.informatics.jax.org/pub/protein-interaction-data/',
            path=path)
        ftp.download(path)

    if download:
        base_url = "ftp://ftp.informatics.jax.org/pub/reports/"
        # Not fetched: MPheno_OBO.ontology, VOC_MammalianPhenotype.rpt,
        # MGI_PhenotypicAllele.rpt, HMD_HumanPhenotype.rpt
        report_names = [
            "MRK_List1.rpt",
            "MRK_List2.rpt",
            "MGI_Coordinate.rpt",
            "MRK_Sequence.rpt",
            "MRK_SwissProt_TrEMBL.rpt",
            "MRK_VEGA.rpt",
            "MRK_ENSEMBL.rpt",
            "MGI_EntrezGene.rpt",
        ]
        for report_name in report_names:
            report = File(url=base_url + report_name)  # downloads on creation
            report.parse(header=True, printing=False)
            folder.downloads.append(report.name)

    if parse:
        folder.update()

        marker_list = "MRK_List1.rpt" if withdrawn else "MRK_List2.rpt"
        records = folder[marker_list].parse(header=True, printing=False)
        genes.addData(records, key='mgi', taxid=10090)

        records = folder["MGI_Coordinate.rpt"].parse(header=True,
                                                     printing=False)
        for record in records:
            record = change_keys(record)
            record['taxid'] = 10090
            genes.add(record)

        records = folder['MRK_Sequence.rpt'].parse(header=True,
                                                   printing=False)
        genes.addData(records, key='mgi', taxid=10090)

        # Column names for the EntrezGene report; split() breaks on any
        # whitespace, so the embedded tab behaves like the spaces.
        entrez_columns = (
            "mgi symbol status name cm_position chromosome\ttype "
            "secondary_accession_ids id synonyms feature_types start "
            "stop strand biotypes"
        ).split()
        records = folder["MGI_EntrezGene.rpt"].parse(header=entrez_columns,
                                                     printing=False)
        genes.addData(records, key="mgi", taxid=10090)
        print(len(genes))

    if cleanup:
        if interactions:
            ftp.remove(confirm=False)
        for downloaded in folder.downloads:
            folder.remove(downloaded)

    genes.keep("category", "Gene")
    genes.remove("name", "withdrawn")
    genes.save()
    genes.buildMappings()
Ejemplo n.º 4
0
 def func(args):
     """Return ``Folder.table`` for ``args.dirname``, forwarding ``args.kwds`` as keywords."""
     build_table = Folder.table
     return build_table(args.dirname, **args.kwds)
Ejemplo n.º 5
0
def main(interactions=False, download=True, parse=True, withdrawn=True, cleanup=True):
    """Download MGI report files and build the gene annotation and mappings.

    Every stage is switched by its own flag:

    - ``interactions``: also fetch the protein-interaction directory via FTP.
    - ``download``: fetch and parse each listed ``.rpt`` report and record
      its name in ``folder.downloads``.
    - ``parse``: load the reports into ``genes`` using taxid 10090.
    - ``withdrawn``: read ``MRK_List1.rpt`` when true, ``MRK_List2.rpt``
      otherwise.
    - ``cleanup``: remove the FTP download (only when ``interactions`` is
      set) and every entry of ``folder.downloads``.

    Independently of the flags, ``genes`` is finally filtered with
    ``keep``/``remove``, saved, and its mappings are rebuilt.

    TODO:
    - Inspect and eventually use the interaction file, else drop it from
      this module.
    - Check whether other MGI information, such as homology or phenotypes,
      is worth integrating.
    """
    os.chdir(path)
    genes.name = "MGI"
    genes.key = "mgi"
    folder = Folder(path)

    if interactions:
        ftp = FTP(url="ftp://ftp.informatics.jax.org/pub/protein-interaction-data/", path=path)
        ftp.download(path)

    if download:
        base_url = "ftp://ftp.informatics.jax.org/pub/reports/"
        # Not fetched: MPheno_OBO.ontology, VOC_MammalianPhenotype.rpt,
        # MGI_PhenotypicAllele.rpt, HMD_HumanPhenotype.rpt
        report_names = [
            "MRK_List1.rpt",
            "MRK_List2.rpt",
            "MGI_Coordinate.rpt",
            "MRK_Sequence.rpt",
            "MRK_SwissProt_TrEMBL.rpt",
            "MRK_VEGA.rpt",
            "MRK_ENSEMBL.rpt",
            "MGI_EntrezGene.rpt",
        ]
        for report_name in report_names:
            report = File(url=base_url + report_name)  # downloads on creation
            report.parse(header=True, printing=False)
            folder.downloads.append(report.name)

    if parse:
        folder.update()

        marker_list = "MRK_List1.rpt" if withdrawn else "MRK_List2.rpt"
        records = folder[marker_list].parse(header=True, printing=False)
        genes.addData(records, key="mgi", taxid=10090)

        records = folder["MGI_Coordinate.rpt"].parse(header=True, printing=False)
        for record in records:
            record = change_keys(record)
            record["taxid"] = 10090
            genes.add(record)

        records = folder["MRK_Sequence.rpt"].parse(header=True, printing=False)
        genes.addData(records, key="mgi", taxid=10090)

        # Column names for the EntrezGene report; split() breaks on any
        # whitespace, so the embedded tab behaves like the spaces.
        entrez_columns = (
            "mgi symbol status name cm_position chromosome\ttype "
            "secondary_accession_ids id synonyms feature_types start "
            "stop strand biotypes"
        ).split()
        records = folder["MGI_EntrezGene.rpt"].parse(header=entrez_columns, printing=False)
        genes.addData(records, key="mgi", taxid=10090)
        print(len(genes))

    if cleanup:
        if interactions:
            ftp.remove(confirm=False)
        for downloaded in folder.downloads:
            folder.remove(downloaded)

    genes.keep("category", "Gene")
    genes.remove("name", "withdrawn")
    genes.save()
    genes.buildMappings()
Ejemplo n.º 6
0
def _merge_aliases(primary_id, candidates):
    """Return ``[primary_id]`` extended with every new, non-placeholder candidate.

    Candidates equal to ``'-'`` and candidates already collected are skipped;
    first-seen order is preserved.
    """
    aliases = [primary_id]
    for name in candidates:
        if name != '-' and name not in aliases:
            aliases.append(name)
    return aliases


def main(generator=False):
    """Fetch the current BioGRID release and write ``interactions.txt``.

    Steps:
    1. Scrape the release version from the BioGRID homepage.
    2. Download the ``tab2`` and ``mitab`` archives of that release via
       ``Folder`` and parse both into sequences of lines.
    3. Write one tab-separated row per tab2 data line to
       ``<path>/interactions.txt``.
    4. Append ``"<version> <timestamp>"`` to ``<path>/version.txt``.
    5. Delete every entry of ``path`` whose name contains ``"BIOGRID"``.

    ``generator`` is accepted for interface compatibility but is not used.

    Raises:
        RuntimeError: if no version marker is found on the homepage
            (previously this surfaced as an unbound-variable error).
    """
    os.chdir(path)

    # Version: read the homepage, always releasing the connection.
    url = 'http://thebiogrid.org/'
    f = urllib.urlopen(url)
    try:
        contents = f.read()
    finally:
        f.close()

    # The full marker (with trailing space) is used for both the test and the
    # split, so a matching line can always be split.
    marker = '<div class="newspost-title">BioGRID Version '
    VERSION = None
    for line in contents.split('\n'):
        if marker in line:
            print(line)
            VERSION = line.split(marker)[1].split(' Release ')[0]
            break
    if VERSION is None:
        raise RuntimeError('Could not find the BioGRID version on %s' % url)
    print(VERSION)

    # Urls:
    url = 'http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-'
    tab2url = url + VERSION + '/BIOGRID-ALL-' + VERSION + '.tab2.zip'
    mitaburl = url + VERSION + '/BIOGRID-ALL-' + VERSION + '.mitab.zip'

    # Files:
    folder = Folder()
    folder.get([tab2url, mitaburl])

    tab2 = folder.contains('tab2')[0].parse(printing=False, seperator=None)
    mitab = folder.contains('mitab')[0].parse(printing=False, seperator=None)

    # Parsing:
    # NOTE(review): D is filled but never returned or read in this function;
    # it is kept so the integer validation of columns 0 and 1 still happens.
    D = {}
    with open(os.path.join(path, 'interactions.txt'), 'w') as output:
        for x, line in enumerate(tab2):
            # Skip the header line and blank lines.
            if "#BioGRID Interaction ID" in line or line == "":
                continue

            columns = line.split('\t')
            interaction_id = int(columns[0])

            # Columns 1/2 seed the alias lists; 5/6 are the systematic names,
            # 7/8 the official gene symbols and 9/10 the '|'-separated synonyms.
            alias_a = _merge_aliases(
                columns[1], [columns[5], columns[7]] + columns[9].split('|'))
            alias_b = _merge_aliases(
                columns[2], [columns[6], columns[8]] + columns[10].split('|'))

            experimental_system_type = [columns[12], 'direct']
            experimental_system = columns[11]

            # Text between the first '(' and the following ')' of mitab field 11.
            # Assumes mitab line x describes the same interaction as tab2
            # line x -- TODO confirm.
            interaction_type = mitab[x].split('\t')[11].split('(')[1].split(')')[0]

            pmid = int(columns[14])
            # NOTE(review): 559292 -> 4932 presumably folds a strain-level
            # taxid into its species-level taxid -- confirm.
            taxid_a = int(columns[15])
            if taxid_a == 559292:
                taxid_a = 4932
            taxid_b = int(columns[16])
            if taxid_b == 559292:
                taxid_b = 4932
            throughput = columns[17].split('|')
            modification = columns[19] if columns[19] != '-' else ''
            source = columns[23]  # 'BioGRID'

            output.write('\t'.join([
                '; '.join(alias_a),
                '; '.join(alias_b),
                '; '.join(experimental_system_type),
                experimental_system,
                interaction_type,
                modification,
                str(taxid_a),
                str(taxid_b),
                str(pmid),
                source + '\n',
            ]))

            D.setdefault(interaction_id, {})[int(columns[1])] = {
                'source': source,
                'experimental_system': experimental_system,
                'experimental_system_type': experimental_system_type,
                'taxid_a': taxid_a,
                'taxid_b': taxid_b,
                'pmid': pmid,
                'throughput': throughput,
            }

    new_entry = '\n%s %s' % (VERSION, datetime.datetime.now())
    with open(os.path.join(path, 'version.txt'), 'a') as version_file:
        version_file.write(new_entry)

    # Cleaning up: remove the downloaded archives / extracted files.
    for filename in os.listdir(path):
        if "BIOGRID" in filename:
            print("Deleting %s" % filename)
            # Join with the directory that was listed instead of relying on
            # the current working directory.
            os.remove(os.path.join(path, filename))