Beispiel #1
0
def write_mapping_file(mapping_generator, outfile, confirm=True):
    """Write each mapping tuple as one tab-separated line and return the count.

    OUTPUT is mapping file:
    -------------------------
    Note: you will not know the source of the mapping unless you use
    the optional parameter "add_source=True" to merge_mapping() function
    col0: Ensembl gene ID
    col2 "add_source" == 1: NCBI ID gene ID from gene2ensembl
    col2 "add_source" == 2: NCBI ID gene ID from ncbi_list if symbol == ensembl symbol
        (i.e. iterate through ncbi list (for each Ensembl ID) on gene_info file
        and when the symbol found matches the ensembl symbol use this
        NCBI ID if symbols match only once)

    Args:
        mapping_generator: iterable of tuples; every element of a tuple is
            converted with ``str`` and the elements are joined with tabs.
        outfile: target path, passed to ``safewfile``.
        confirm: forwarded to ``safewfile`` as its ``prompt`` argument.

    Returns:
        The number of items consumed from ``mapping_generator`` and written.
    """
    print("step 6 start: write file from mapping generator of tuples")
    # safewfile hands back the open handle plus the name it actually used.
    mapping_file, mapping_filename = safewfile(outfile,
                                               prompt=confirm,
                                               default='O')

    count = 0
    try:
        for item in mapping_generator:
            mapping_file.write('\t'.join(str(field) for field in item) + "\n")
            count += 1
    finally:
        # Close the handle even when the generator or a write fails midway.
        mapping_file.close()

    print("total Ensembl IDs uniquely mapped to NCBI gene ID:", count)
    print("Output file: \"{}\"".format(mapping_filename))
    print("step 6 end\n")
    return count
Beispiel #2
0
 def _fetch_data(self, outfile, attributes, filters='', header=None, debug=False):
     """Query the mart once per species and dump all rows into one TSV file.

     Every non-blank line of a species' response is written prefixed with
     that species' taxid (``species[2]``) and a tab.

     Args:
         outfile: target path, passed to ``safewfile`` (no prompt).
         attributes: forwarded to ``_make_query_xml``.
         filters: forwarded to ``_make_query_xml``.
         header: optional sequence of column names written as the first line.
         debug: when true, the generated query XML is logged at INFO level.

     Species whose dataset name cannot be resolved (``IndexError`` or a falsy
     name) are skipped; a ``MartException`` from the query is logged with its
     traceback and the species is skipped.
     """
     cnt_all = 0
     out_f, outfile = safewfile(outfile, prompt=False, default='O')
     try:
         if header:
             out_f.write('\t'.join(header) + '\n')
         for species in self.__class__.species_li:
             try:
                 dataset = self.get_dataset_name(species)
             except IndexError:
                 # bad dataset name, skip (this used to be caught in a
                 # try/finally so it wasn't dealt with before).
                 # Pass species as a logger argument: "..." % species would
                 # raise TypeError when species is a multi-element tuple.
                 self.logger.debug("Skip species '%s'", species)
                 continue
             taxid = species[2]
             if not dataset:
                 continue
             xml = self._make_query_xml(dataset, attributes=attributes, filters=filters)
             if debug:
                 self.logger.info(xml)
             try:
                 con = self.query_mart(xml)
             except MartException:
                 import traceback
                 self.logger.error("%s %s", species[0], traceback.format_exc())
                 continue
             cnt = 0
             for line in con.split('\n'):
                 if line.strip() != '':
                     out_f.write(str(taxid) + '\t' + line + '\n')
                     cnt += 1
                     cnt_all += 1
             self.logger.info("%s %s", species[0], cnt)
     finally:
         # Release the handle even if an unexpected error aborts the dump.
         out_f.close()
     self.logger.info("Total: %d", cnt_all)
def write_mapping_file(mapping_generator, confirm=True):
    """Write each mapping tuple as one tab-separated line and return the count.

    OUTPUT is mapping file:
    -------------------------
    Note: you will not know the source of the mapping unless you use
    the optional parameter "add_source=True" to merge_mapping() function
    col0: Ensembl gene ID
    col2 "add_source" == 1: NCBI ID gene ID from gene2ensembl
    col2 "add_source" == 2: NCBI ID gene ID from ncbi_list if mygene.info symbol == ensembl symbol
        (i.e. iterate through ncbi list (for each Ensembl ID) on mygene.info
        (ex: http://mygene.info/v2/gene/100894237?fields=symbol )
        and when the symbol found matches the ensembl symbol use this
        NCBI ID if symbols match only once)

    Args:
        mapping_generator: iterable of tuples; every element of a tuple is
            converted with ``str`` and the elements are joined with tabs.
        confirm: forwarded to ``safewfile`` as its ``prompt`` argument.

    Returns:
        The number of items consumed from ``mapping_generator`` and written.

    NOTE(review): ``outfile`` is not a parameter here; the name is looked up
    in the enclosing scope at call time. Confirm it is defined at module
    level, otherwise this function raises NameError.
    """
    print("step 6 start: write file from mapping generator of tuples")
    # safewfile hands back the open handle plus the name it actually used.
    mapping_file, mapping_filename = safewfile(outfile, prompt=confirm, default='O')

    count = 0
    try:
        for item in mapping_generator:
            mapping_file.write('\t'.join(str(field) for field in item) + "\n")
            count += 1
    finally:
        # Close the handle even when the generator or a write fails midway.
        mapping_file.close()

    print("total Ensembl IDs uniquely mapped to NCBI gene ID:", count)
    print("Output file: \"{}\"".format(mapping_filename))
    print("step 6 end\n")
    return count
def file_merge(infiles, outfile=None, header=1, verbose=1):
    """Concatenate a list of same-format input files into one output file.

    The first file is copied in full. For every later file the first
    ``header`` lines are read and discarded before the rest is copied, so
    the header appears only once in the merged output.

    Args:
        infiles: sequence of input paths, each opened with ``anyfile``.
        outfile: output path; when falsy, the first input's name with
            ``_merged`` inserted before its extension is used.
        header: number of leading lines skipped in each file after the first.
        verbose: when true, prints "Merging..." before starting. The
            per-file progress and the final summary are always printed.
    """
    outfile = outfile or '_merged'.join(os.path.splitext(infiles[0]))
    out_f, outfile = safewfile(outfile)
    if verbose:
        print("Merging...")
    cnt = 0
    try:
        for i, fn in enumerate(infiles):
            print(os.path.split(fn)[1], '...', end='')
            line_no = 0
            in_f = anyfile(fn)
            try:
                if i > 0:
                    # Drop the repeated header of every non-first file.
                    for _ in range(header):
                        in_f.readline()
                for line in in_f:
                    out_f.write(line)
                    line_no += 1
            finally:
                # Do not leak the input handle if reading or writing fails.
                in_f.close()
            cnt += line_no
            print(line_no)
    finally:
        out_f.close()
    print("=" * 20)
    print("Done![total %d lines output]" % cnt)
Beispiel #5
0
def file_merge(infiles, outfile=None, header=1, verbose=1):
    """Concatenate a list of same-format input files into one output file.

    The first file is copied in full. For every later file the first
    ``header`` lines are read and discarded before the rest is copied, so
    the header appears only once in the merged output.

    Args:
        infiles: sequence of input paths, each opened with ``anyfile``.
        outfile: output path; when falsy, the first input's name with
            ``_merged`` inserted before its extension is used.
        header: number of leading lines skipped in each file after the first.
        verbose: when true, prints "Merging..." before starting. The
            per-file progress and the final summary are always printed.
    """
    outfile = outfile or '_merged'.join(os.path.splitext(infiles[0]))
    out_f, outfile = safewfile(outfile)
    if verbose:
        print("Merging...")
    cnt = 0
    try:
        for i, fn in enumerate(infiles):
            print(os.path.split(fn)[1], '...', end='')
            line_no = 0
            in_f = anyfile(fn)
            try:
                if i > 0:
                    # Drop the repeated header of every non-first file.
                    for _ in range(header):
                        in_f.readline()
                for line in in_f:
                    out_f.write(line)
                    line_no += 1
            finally:
                # Do not leak the input handle if reading or writing fails.
                in_f.close()
            cnt += line_no
            print(line_no)
    finally:
        out_f.close()
    print("=" * 20)
    print("Done![total %d lines output]" % cnt)
Beispiel #6
0
 def _fetch_data(self, outfile, attributes, filters='', header=None, debug=False):
     """Query the mart once per species and dump all rows into one TSV file.

     Every non-blank line of a species' response is written prefixed with
     that species' taxid (``species[2]``) and a tab.

     Args:
         outfile: target path, passed to ``safewfile``; the user is prompted
             unless ``self.no_confirm`` is true.
         attributes: forwarded to ``_make_query_xml``.
         filters: forwarded to ``_make_query_xml``.
         header: optional sequence of column names written as the first line.
         debug: when true, the generated query XML is logged at INFO level.

     Species with a falsy dataset name are skipped; a ``MartException`` from
     the query is logged with its traceback and the species is skipped.
     """
     cnt_all = 0
     out_f, outfile = safewfile(outfile, prompt=(not self.no_confirm), default='O')
     try:
         if header:
             out_f.write('\t'.join(header) + '\n')
         logging.info('Dumping "%s"...', os.path.split(outfile)[1])
         for species in self.species_li:
             dataset = self.get_dataset_name(species)
             taxid = species[2]
             if not dataset:
                 continue
             xml = self._make_query_xml(dataset, attributes=attributes, filters=filters)
             if debug:
                 logging.info(xml)
             try:
                 con = self.query_mart(xml)
             except MartException:
                 import traceback
                 logging.error("%s %s", species[0], traceback.format_exc())
                 continue
             cnt = 0
             for line in con.split('\n'):
                 if line.strip() != '':
                     out_f.write(str(taxid) + '\t' + line + '\n')
                     cnt += 1
                     cnt_all += 1
             logging.info("%s %s", species[0], cnt)
     finally:
         # Release the handle even if an unexpected error aborts the dump.
         out_f.close()
     logging.info("Total: %d", cnt_all)
Beispiel #7
0
def dispatch(src):
    """Start the uploader for ``src`` as a child process and return it.

    The upload log is named ``<src>_upload.log``. It is placed next to the
    data-dump logfile recorded in the ``src_dump`` document for ``src``, or
    under ``DATA_ARCHIVE_ROOT`` when that document has no (or an empty)
    ``logfile`` entry.

    The child's stdout and stderr both go to the log file. The returned
    ``Popen`` object carries two extra attributes: ``logfile`` (the log's
    name as returned by ``safewfile``) and ``log_f`` (the still-open handle).
    """
    dump_record = src_dump.find_one({'_id': src})
    dump_log = dump_record.get('logfile', '')
    if not dump_log:
        # Imported lazily: only needed when there is no dump log to sit beside.
        from config import DATA_ARCHIVE_ROOT
        log_dir = DATA_ARCHIVE_ROOT
    else:
        log_dir = os.path.split(dump_log)[0]
    upload_logfile = os.path.join(log_dir, '{}_upload.log'.format(src))

    log_f, logfile = safewfile(upload_logfile, prompt=False, default='O')
    command = ['python', '-u', '-m', 'dataload.start', src]
    proc = Popen(command, stdout=log_f, stderr=STDOUT, cwd=src_path)
    # The handle stays open for the child's lifetime; expose it to the caller.
    proc.logfile = logfile
    proc.log_f = log_f
    return proc
Beispiel #8
0
def dispatch(src):
    """Start the uploader for ``src`` as a child process and return it.

    The upload log is named ``<src>_upload.log``. It is placed next to the
    data-dump logfile recorded in the ``src_dump`` document for ``src``, or
    under ``DATA_ARCHIVE_ROOT`` when that document has no (or an empty)
    ``logfile`` entry.

    The child's stdout and stderr both go to the log file. The returned
    ``Popen`` object carries two extra attributes: ``logfile`` (the log's
    name as returned by ``safewfile``) and ``log_f`` (the still-open handle).
    """
    dump_record = src_dump.find_one({'_id': src})
    dump_log = dump_record.get('logfile', '')
    if not dump_log:
        # Imported lazily: only needed when there is no dump log to sit beside.
        from config import DATA_ARCHIVE_ROOT
        log_dir = DATA_ARCHIVE_ROOT
    else:
        log_dir = os.path.split(dump_log)[0]
    upload_logfile = os.path.join(log_dir, '{}_upload.log'.format(src))

    log_f, logfile = safewfile(upload_logfile, prompt=False, default='O')
    command = ['python', '-u', '-m', 'dataload.start', src]
    proc = Popen(command, stdout=log_f, stderr=STDOUT, cwd=src_path)
    # The handle stays open for the child's lifetime; expose it to the caller.
    proc.logfile = logfile
    proc.log_f = log_f
    return proc
Beispiel #9
0
 def _fetch_data(self,
                 outfile,
                 attributes,
                 filters='',
                 header=None,
                 debug=False):
     """Query the mart once per species and dump all rows into one TSV file.

     Every non-blank line of a species' response is written prefixed with
     that species' taxid (``species[2]``) and a tab.

     Args:
         outfile: target path, passed to ``safewfile``; the user is prompted
             unless ``self.no_confirm`` is true.
         attributes: forwarded to ``_make_query_xml``.
         filters: forwarded to ``_make_query_xml``.
         header: optional sequence of column names written as the first line.
         debug: when true, the generated query XML is logged at INFO level.

     Species with a falsy dataset name are skipped; a ``MartException`` from
     the query is logged with its traceback and the species is skipped.
     """
     cnt_all = 0
     out_f, outfile = safewfile(outfile,
                                prompt=(not self.no_confirm),
                                default='O')
     try:
         if header:
             out_f.write('\t'.join(header) + '\n')
         logging.info('Dumping "%s"...', os.path.split(outfile)[1])
         for species in self.species_li:
             dataset = self.get_dataset_name(species)
             taxid = species[2]
             if not dataset:
                 continue
             xml = self._make_query_xml(dataset,
                                        attributes=attributes,
                                        filters=filters)
             if debug:
                 logging.info(xml)
             try:
                 con = self.query_mart(xml)
             except MartException:
                 import traceback
                 logging.error("%s %s", species[0], traceback.format_exc())
                 continue
             cnt = 0
             for line in con.split('\n'):
                 if line.strip() != '':
                     out_f.write(str(taxid) + '\t' + line + '\n')
                     cnt += 1
                     cnt_all += 1
             logging.info("%s %s", species[0], cnt)
     finally:
         # Release the handle even if an unexpected error aborts the dump.
         out_f.close()
     logging.info("Total: %d", cnt_all)
Beispiel #10
0
 def _fetch_data(self, outfile, attributes, filters='', header=None):
     """Query the mart once per species and dump all rows into one TSV file.

     Every non-blank line of a species' response is written prefixed with
     that species' taxid (``species[2]``) and a tab.

     Args:
         outfile: target path, passed to ``safewfile`` (no prompt).
         attributes: list of attribute names forwarded to ``_make_query_xml``.
         filters: forwarded to ``_make_query_xml``.
         header: optional sequence of column names written as the first line.

     Per-species error handling:
         * ``IndexError`` from ``get_dataset_name`` or a falsy dataset name:
           the species is skipped.
         * ``EntrezgeneNotFound``: the species is skipped; it still counts as
           a success (with a warning) when ``'xref_entrezgene'`` occurs in
           the output path, otherwise an error is logged.
         * ``GeneNameNotFound``: the query is retried without the
           ``'external_gene_name'`` attribute and an empty column is written
           after the first field of every row. If the retry raises
           ``MartException`` the species is skipped.
         * any other ``MartException``: logged, species skipped.
     """
     cnt_lines_all = 0
     cnt_species_success = 0
     out_f, outfile = safewfile(outfile, prompt=False, default='O')
     out_name = os.path.basename(outfile)
     total_species = len(self.species_li)
     try:
         if header:
             out_f.write('\t'.join(header) + '\n')
         for count, species in enumerate(self.species_li):
             try:
                 dataset = self.get_dataset_name(species)
             except IndexError:
                 self.logger.debug("Skip species '%s'", species)
                 continue
             if not dataset:
                 continue
             taxid = species[2]
             xml = self._make_query_xml(
                 dataset, attributes=attributes, filters=filters)
             try:
                 con = self.query_mart(xml)
             except EntrezgeneNotFound as err:
                 if 'xref_entrezgene' in outfile:
                     cnt_species_success += 1
                     self.logger.warning("%s:: %s: %s", out_name, species[0],
                                         'Skipping species without entrez gene id')
                 else:
                     self.logger.error("%s:: %s %s", out_name, species[0], err)
                 continue
             except GeneNameNotFound:
                 _attributes = attributes.copy()
                 _attributes.remove('external_gene_name')
                 self.logger.debug(_attributes)
                 _xml = self._make_query_xml(
                     dataset, attributes=_attributes, filters=filters)
                 try:
                     con = self.query_mart(_xml)
                 except MartException as err:
                     self.logger.error("%s:: %s %s", out_name, species[0], err)
                     # Without this, `con` is either unbound or still holds
                     # the previous species' response, which would then be
                     # written under the current taxid.
                     continue
                 self.logger.warning("%s:: %s: %s", out_name, species[0],
                                     'Retried to request species without external gene name')
                 cnt_lines = 0
                 cnt_species_success += 1
                 for line in con.split('\n'):
                     if line.strip() != '':
                         tsv = line.split('\t')
                         # The double tab leaves an empty column where the
                         # gene name was dropped from the retry query.
                         out_f.write(str(taxid) + '\t' +
                                     tsv[0] + '\t\t' + '\t'.join(tsv[1:]) + '\n')
                         cnt_lines += 1
                         cnt_lines_all += 1
                 self.logger.info("%s:: %d/%d %s %d records", out_name,
                                  count + 1, total_species, species[0], cnt_lines)
                 continue
             except MartException as err:
                 self.logger.error("%s:: %s %s", out_name, species[0], err)
                 continue
             cnt_lines = 0
             cnt_species_success += 1
             if not con:
                 self.logger.error('Empty Response.')
             for line in con.split('\n'):
                 if line.strip() != '':
                     out_f.write(str(taxid) + '\t' + line + '\n')
                     cnt_lines += 1
                     cnt_lines_all += 1
             self.logger.info("%s:: %d/%d %s %d records", out_name,
                              count + 1, total_species, species[0], cnt_lines)
     finally:
         # Release the handle even if an unexpected error aborts the dump.
         out_f.close()
     self.logger.info("Total: %s:: %d/%d successes %d records", out_name,
                      cnt_species_success, total_species, cnt_lines_all)