コード例 #1
0
    def parse(self):
        """Build lookup tables from the gene_info and gene_history files.

        Both files are read as tab-delimited text through ``gzip_to_text``.
        Only rows whose first column is one of the taxonomy ids 9606,
        10090 or 10116 are used.

        Returns:
            A ``(eg_dict, history_dict)`` tuple.

            ``eg_dict`` maps ``(EntrezGeneParser.resourceLocation, rec[1])``
            to ``(self.encoding[rec[9]], uuid.uuid4())`` for every kept
            gene_info row (presumably columns 1 and 9 are the gene id and
            the gene type -- confirm against the file layout).

            ``history_dict`` maps column 2 of every kept gene_history row
            to the result of ``self.__walk__`` for that key.

        Raises:
            KeyError: if ``rec[9]`` of a kept row is not in ``self.encoding``.
            IndexError: if a row has fewer columns than are indexed here.
        """
        taxa = ("9606", "10090", "10116")

        # parse gene_info to represent the current valid entrez gene ids.
        # A dict comprehension consumes the reader row by row instead of
        # first building a list of tuples for the whole file.
        info_rows = csv.reader(gzip_to_text(self.gene_info),
                               delimiter="\t", quotechar="\"")
        eg_dict = {
            (EntrezGeneParser.resourceLocation, rec[1]):
                (self.encoding[rec[9]], uuid.uuid4())
            for rec in info_rows
            if rec[0] in taxa
        }

        # parse gene_history to know which gene ids were discontinued
        # (column 2 -> column 1 of each kept row).
        history_rows = csv.reader(gzip_to_text(self.gene_history),
                                  delimiter="\t", quotechar="\"")
        replaced_by = {rec[2]: rec[1] for rec in history_rows if rec[0] in taxa}

        # Resolve every key through __walk__ against the raw mapping.
        history_dict = {k: self.__walk__(replaced_by, k) for k in replaced_by}
        return (eg_dict, history_dict)
コード例 #2
0
    def parse(self):
        """Yield the rows of ``self._url`` whose tax_id is 9606, 10090 or 10116.

        The source is read through ``gzip_to_text`` as tab-delimited text.
        Each row is yielded as the mapping produced by ``csv.DictReader``,
        keyed by the sixteen field names listed below.
        """
        # The field names are given explicitly instead of being taken from
        # the file: its first value begins with a hashtag (i.e. "#Format:"),
        # which is NOT a column header DictReader could use.
        fields = [
            "tax_id",
            "GeneID",
            "status",
            "RNA nucleotide accession.version",
            "RNA nucleotide gi",
            "protein accession.version",
            "protein gi",
            "genomic nucleotide accession.version",
            "genomic nucleotide gi",
            "start position on the genomic accession",
            "end position on the genomic accession",
            "orientation",
            "assembly",
            "mature peptide accession.version",
            "mature peptide gi",
            "Symbol",
        ]
        wanted_taxa = ("9606", "10090", "10116")

        records = csv.DictReader(
            gzip_to_text(self._url),
            delimiter="\t",
            fieldnames=fields,
        )
        yield from (rec for rec in records if rec["tax_id"] in wanted_taxa)
コード例 #3
0
    def parse(self):
        """Yield every row of ``self.cid_file`` as a dict.

        The file is read through ``gzip_to_text`` as tab-delimited text and
        each row is keyed by 'PubChem SID', 'Source', 'External ID' and
        'PubChem CID'. No rows are filtered out.
        """
        fields = [
            'PubChem SID',
            'Source',
            'External ID',
            'PubChem CID',
        ]
        source = gzip_to_text(self.cid_file)
        yield from csv.DictReader(source, delimiter='\t', fieldnames=fields)
コード例 #4
0
    def parse(self):
        """Yield rows of ``self._url`` restricted to three taxonomy ids.

        Rows are read as tab-delimited text via ``gzip_to_text`` and keyed
        by ``self.headers``; only rows whose 'tax_id' is '9606', '10090'
        or '10116' are yielded.
        """
        source = gzip_to_text(self._url)
        names = self.headers
        keep = ('9606', '10090', '10116')

        for record in csv.DictReader(source, delimiter='\t', fieldnames=names):
            # Guard clause: skip anything outside the taxa of interest.
            if record['tax_id'] not in keep:
                continue
            yield record
コード例 #5
0
	def parse(self):
		"""Yield the rows of ``self._url`` whose 'tax_id' is 9606, 10090 or 10116.

		The source goes through ``gzip_to_text`` and is parsed as
		tab-delimited text with ``self.headers`` as the field names.
		"""
		taxa = ('9606', '10090', '10116')
		text = gzip_to_text(self._url)
		table = csv.DictReader(text, delimiter='\t', fieldnames=self.headers)

		# filter() keeps only the records for the selected taxonomy ids.
		yield from filter(lambda rec: rec['tax_id'] in taxa, table)
コード例 #6
0
    def parse(self):
        """Yield every row of ``self._url`` as a dict keyed by the header row.

        The source is first opened through ``gzip_to_text``. If building
        that reader raises, ``self._url`` is opened as a plain text file
        decoded as iso-8859-1 instead. Field names come from the first row
        of the tab-delimited input, since no ``fieldnames`` are passed.

        Only errors raised while the reader is being constructed trigger
        the fallback; errors raised later, during iteration, propagate.
        """
        try:
            reader = csv.DictReader(gzip_to_text(self._url), delimiter="\t")
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            # Fall back to plain text with iso-8859-1 decoding; the
            # ``with`` block closes the handle once the rows are exhausted
            # or the generator is closed.
            with open(self._url, "r", encoding="iso-8859-1") as plain_text:
                yield from csv.DictReader(plain_text, delimiter="\t")
            return

        yield from reader
コード例 #7
0
    def parse(self):
        """Yield every row of ``self.pub_file`` as a dict.

        The file is read through ``gzip_to_text`` as tab-delimited text and
        each row is keyed by 'pubchem_id' and 'synonym'. No rows are
        filtered out.
        """
        column_headers = ['pubchem_id', 'synonym']

        # Read from the gzip_to_text stream only. Re-opening the path with
        # open() inside a ``with`` block and iterating after that block has
        # exited reads from a closed file and raises ValueError on the
        # first row.
        # NOTE(review): assumes pub_file is meant to go through
        # gzip_to_text like the other parsers -- confirm.
        pub_reader = csv.DictReader(gzip_to_text(self.pub_file), delimiter='\t',
                                    fieldnames=column_headers)

        yield from pub_reader
コード例 #8
0
    def parse(self):
        """Yield each row of ``self._url`` as a dict keyed by the header row.

        The primary path reads the source through ``gzip_to_text``. If
        constructing that reader raises, the path is re-opened as a plain
        text file decoded as iso-8859-1. The input is tab-delimited and
        its first row supplies the field names.

        The fallback covers only failures raised while the reader is
        created; exceptions raised while rows are being read propagate to
        the caller.
        """
        try:
            reader = csv.DictReader(gzip_to_text(self._url), delimiter='\t')
        except Exception:
            # Catch ``Exception`` only: a bare ``except`` would also trap
            # KeyboardInterrupt and SystemExit.
            # The context manager guarantees the fallback file handle is
            # closed when iteration finishes or the generator is closed.
            with open(self._url, "r", encoding="iso-8859-1") as fallback:
                yield from csv.DictReader(fallback, delimiter='\t')
        else:
            yield from reader
コード例 #9
0
    def parse(self):
        """Yield rows of ``self.entrez_history`` for three taxonomy ids.

        The file is read through ``gzip_to_text`` as tab-delimited text.
        Rows are keyed by the five field names below and yielded only when
        their 'tax_id' is "9606", "10090" or "10116".
        """
        fields = [
            "tax_id",
            "GeneID",
            "Discontinued_GeneID",
            "Discontinued_Symbol",
            "Discontinue_Date",
        ]
        selected = ("9606", "10090", "10116")

        # one dict per line of the history file
        records = csv.DictReader(
            gzip_to_text(self.entrez_history),
            delimiter='\t',
            fieldnames=fields,
        )
        yield from (rec for rec in records if rec['tax_id'] in selected)
コード例 #10
0
    def parse(self):
        """Yield rows of ``self.entrez_info`` for three taxonomy ids.

        The file is read through ``gzip_to_text`` as tab-delimited text.
        Rows are keyed by the fifteen field names below and yielded only
        when their 'tax_id' is '9606', '10090' or '10116'.
        """
        # columns for an Entrez gene info dataset, in file order.
        fields = [
            'tax_id',
            'GeneID',
            'Symbol',
            'LocusTag',
            'Synonyms',
            'dbXrefs',
            'chromosome',
            'map_location',
            'description',
            'type_of_gene',
            'Symbol_from_nomenclature_authority',
            'Full_name_from_nomenclature_authority',
            'Nomenclature_status',
            'Other_designations',
            'Modification_date',
        ]
        keep = ('9606', '10090', '10116')

        source = gzip_to_text(self.entrez_info)
        for record in csv.DictReader(source, delimiter='\t', fieldnames=fields):
            if record['tax_id'] not in keep:
                continue
            yield record
コード例 #11
0
    def parse(self):
        """Yield every row of ``self.gene2acc_file`` as a dict.

        The file is read through ``gzip_to_text`` as tab-delimited text and
        keyed by the sixteen field names below. No rows are filtered out.
        """
        # Field names are hard-coded because DictReader cannot take them
        # from the file: the first value there begins with a hashtag
        # (i.e. "#Format:"), which is NOT a column header.
        # NOTE(review): nothing here skips such a leading '#' line, so it
        # would be yielded like any other row -- confirm callers expect that.
        fields = [
            'tax_id',
            'GeneID',
            'status',
            'RNA nucleotide accession.version',
            'RNA nucleotide gi',
            'protein accession.version',
            'protein gi',
            'genomic nucleotide accession.version',
            'genomic nucleotide gi',
            'start position on the genomic accession',
            'end position on the genomic accession',
            'orientation',
            'assembly',
            'mature peptide accession.version',
            'mature peptide gi',
            'Symbol',
        ]

        source = gzip_to_text(self.gene2acc_file)
        yield from csv.DictReader(source, delimiter='\t', fieldnames=fields)
コード例 #12
0
    def parse(self):
        """Yield the rows of ``self._url`` whose tax_id is 9606, 10090 or 10116.

        The source is read through ``gzip_to_text`` as tab-delimited text
        with ``self.headers`` as the field names.
        """
        taxa = ("9606", "10090", "10116")
        text = gzip_to_text(self._url)
        table = csv.DictReader(text, delimiter="\t", fieldnames=self.headers)

        def wanted(record):
            # True for records belonging to one of the selected taxa.
            return record["tax_id"] in taxa

        yield from filter(wanted, table)