def main(args=None):
    """Extract gene-level expression data from StringTie output.

    The StringTie file is read twice. The first pass records, for every
    StringTie gene ID, the set of reference gene names ("ref_gene_name"
    attribute) seen on its transcripts. The second pass adds up the FPKM and
    TPM values of all transcripts per gene. Transcripts without a reference
    gene name are either skipped (``no_novel_transcripts``) or assigned to
    the single reference gene associated with their gene ID, if there is
    exactly one. Finally, one tab-separated row (gene, FPKM, TPM) is written
    to the output file for each gene in the gene file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.
        The attributes read are `stringtie_file`, `gene_file`,
        `no_novel_transcripts`, `output_file`, `log_file`, `quiet` and
        `verbose`.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    ValueError
        If a non-comment line of the StringTie file does not consist of
        exactly nine tab-separated fields.
    """
    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    stringtie_file = args.stringtie_file
    gene_file = args.gene_file
    no_novel_transcripts = args.no_novel_transcripts
    output_file = args.output_file
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    def percent(part, whole):
        """Return `part` as a percentage of `whole` (0.0 if `whole` is 0)."""
        # Guards against ZeroDivisionError / NaN when nothing was counted,
        # e.g. for a StringTie run without any reference annotation.
        if not whole:
            return 0.0
        return 100 * (part / float(whole))

    def transcript_attributes():
        """Yield the parsed attribute field of every transcript line."""
        with open(stringtie_file) as fh:
            reader = csv.reader(fh, dialect="excel-tab")
            for fields in reader:
                # skip blank lines and header/comment lines
                if not fields or fields[0].startswith("#"):
                    continue
                if len(fields) != 9:
                    raise ValueError(
                        'Line %d of "%s" has %d fields (expected 9).'
                        % (reader.line_num, stringtie_file, len(fields))
                    )
                if fields[2] != "transcript":
                    # skip exon lines
                    continue
                yield parse_attributes(fields[8])

    # read list of gene symbols
    logger.info("Reading gene data...")
    genes = misc.read_single(gene_file)

    # first pass: map each StringTie gene ID to its reference gene names
    logger.info("Parsing StringTie output...")
    logger.info("Associating StringTie gene IDs with gene symbols...")
    stringtie_genes = {}
    for attr in transcript_attributes():
        try:
            ref_gene = attr["ref_gene_name"]
        except KeyError:
            # novel transcript, carries no reference gene name
            continue
        stringtie_genes.setdefault(attr["gene_id"], set()).add(ref_gene)

    num_ids = len(stringtie_genes)
    logger.info("Associated %d gene IDs with gene symbols.", num_ids)

    num_ambiguous = sum(1 for names in stringtie_genes.values()
                        if len(names) > 1)
    logger.info(
        "%d / %d associated with multiple gene symbols (%.1f%%).",
        num_ambiguous, num_ids, percent(num_ambiguous, num_ids),
    )

    # second pass: summarize FPKM and TPM per gene
    n = len(genes)
    fpkm = np.zeros(n, dtype=np.float64)
    tpm = np.zeros(n, dtype=np.float64)

    # FPKM that could not be assigned to a gene, by reason
    fpkm_novel_gene = 0.0
    fpkm_unknown_gene_name = 0.0
    fpkm_novel_trans = 0.0
    fpkm_ambig = 0.0

    for attr in transcript_attributes():
        f = float(attr["FPKM"])
        try:
            g = attr["ref_gene_name"]
        except KeyError:
            if no_novel_transcripts:
                # ignore this transcript
                fpkm_novel_trans += f
                continue
            # see if we can assign a gene name based on the gene ID
            try:
                assoc = stringtie_genes[attr["gene_id"]]
            except KeyError:
                # gene ID not associated with any reference gene
                fpkm_novel_gene += f
                continue
            if len(assoc) > 1:
                # gene ID associated with multiple ref. genes => ignored
                fpkm_ambig += f
                continue
            # gene ID associated with exactly one ref. gene
            g = next(iter(assoc))

        try:
            # NOTE(review): presumably requires `genes` to be sorted --
            # confirm against misc.bisect_index.
            idx = misc.bisect_index(genes, g)
        except ValueError:
            fpkm_unknown_gene_name += f
            logger.warning('Unknown gene name: "%s".', g)
            continue

        t = float(attr["TPM"])
        fpkm[idx] += f
        tpm[idx] += t

    if no_novel_transcripts:
        ignored_fpkm = fpkm_novel_trans + fpkm_unknown_gene_name
    else:
        ignored_fpkm = fpkm_novel_gene + fpkm_ambig + fpkm_unknown_gene_name
    total_fpkm = np.sum(fpkm) + ignored_fpkm
    logger.info("Ignored %.1f / %.1f FPKM (%.1f%%)",
                ignored_fpkm, total_fpkm, percent(ignored_fpkm, total_fpkm))

    if no_novel_transcripts:
        if fpkm_novel_trans > 0:
            logger.info(
                "Ignored %.1f FPKM from novel transcripts (%.1f%%).",
                fpkm_novel_trans, percent(fpkm_novel_trans, total_fpkm),
            )
    else:
        if fpkm_novel_gene > 0:
            logger.info(
                "Ignored %.1f FPKM from transcripts of novel genes "
                "(%.1f%%).",
                fpkm_novel_gene, percent(fpkm_novel_gene, total_fpkm),
            )
        if fpkm_ambig > 0:
            logger.info(
                "Ignored %.1f FPKM from transcripts with ambiguous "
                "gene membership (%.1f%%).",
                fpkm_ambig, percent(fpkm_ambig, total_fpkm),
            )

    # unknown gene names can occur in both modes, so always report them
    if fpkm_unknown_gene_name > 0:
        logger.info(
            "Ignored %.1f FPKM from transcripts of genes with unknown "
            "names (%.1f%%).",
            fpkm_unknown_gene_name,
            percent(fpkm_unknown_gene_name, total_fpkm),
        )

    # write output file
    with open(output_file, "w") as ofh:
        writer = csv.writer(ofh, dialect="excel-tab",
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE)
        for g, gene_fpkm, gene_tpm in zip(genes, fpkm, tpm):
            writer.writerow([g, "%.5f" % gene_fpkm, "%.5f" % gene_tpm])

    return 0
def main(args=None):
    """Extract gene-level expression data from StringTie output.

    The StringTie file is parsed in two passes. Pass one collects, for each
    StringTie gene ID, the set of reference gene names ("ref_gene_name"
    attribute) found on its transcripts. Pass two accumulates the FPKM and
    TPM of every transcript onto its gene. A transcript lacking a reference
    gene name is skipped when ``no_novel_transcripts`` is set; otherwise it
    is assigned to the reference gene of its gene ID, provided that gene ID
    maps to exactly one reference gene. The result is written as
    tab-separated rows (gene, FPKM, TPM), one per gene in the gene file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.
        The attributes read are `stringtie_file`, `gene_file`,
        `no_novel_transcripts`, `output_file`, `log_file`, `quiet` and
        `verbose`.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    ValueError
        If a non-comment line of the StringTie file does not consist of
        exactly nine tab-separated fields.
    """
    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    stringtie_file = args.stringtie_file
    gene_file = args.gene_file
    no_novel_transcripts = args.no_novel_transcripts
    output_file = args.output_file
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    def percentage(part, whole):
        """Return 100 * part / whole, or 0.0 when `whole` is zero."""
        # A zero denominator occurs when the file has no annotated (or no)
        # transcripts; report 0% instead of crashing or logging NaN.
        return 100 * (part / float(whole)) if whole else 0.0

    def check_field_count(row, line_num):
        """Raise ValueError unless `row` has the nine expected columns."""
        if len(row) != 9:
            raise ValueError(
                'Line %d of "%s" has %d fields (expected 9).'
                % (line_num, stringtie_file, len(row)))

    # read list of gene symbols
    logger.info('Reading gene data...')
    genes = misc.read_single(gene_file)

    # pass 1: associate StringTie gene IDs with reference gene names
    logger.info('Parsing StringTie output...')
    logger.info('Associating StringTie gene IDs with gene symbols...')
    stringtie_genes = {}
    with open(stringtie_file) as fh:
        reader = csv.reader(fh, dialect='excel-tab')
        for row in reader:
            if not row or row[0].startswith('#'):
                # skip blank lines and header
                continue
            check_field_count(row, reader.line_num)
            if row[2] != 'transcript':
                continue
            attr = parse_attributes(row[8])
            try:
                ref_gene = attr['ref_gene_name']
            except KeyError:
                continue
            # entry has a "ref_gene_name" attribute
            stringtie_genes.setdefault(attr['gene_id'], set()).add(ref_gene)

    total_ids = len(stringtie_genes)
    logger.info('Associated %d gene IDs with gene symbols.', total_ids)

    ambiguous_ids = sum(1 for names in stringtie_genes.values()
                        if len(names) > 1)
    logger.info('%d / %d associated with multiple gene symbols (%.1f%%).',
                ambiguous_ids, total_ids,
                percentage(ambiguous_ids, total_ids))

    # pass 2: summarize FPKM and TPM per gene
    n = len(genes)
    fpkm = np.zeros(n, dtype=np.float64)
    tpm = np.zeros(n, dtype=np.float64)

    # FPKM that could not be attributed to a known gene, by reason
    fpkm_novel_gene = 0.0
    fpkm_unknown_gene_name = 0.0
    fpkm_novel_trans = 0.0
    fpkm_ambig = 0.0

    with open(stringtie_file) as fh:
        reader = csv.reader(fh, dialect='excel-tab')
        for row in reader:
            if not row or row[0].startswith('#'):
                # skip blank lines and header
                continue
            check_field_count(row, reader.line_num)
            if row[2] != 'transcript':
                # skip exon lines
                continue

            attr = parse_attributes(row[8])
            f = float(attr['FPKM'])

            try:
                g = attr['ref_gene_name']
            except KeyError:
                if no_novel_transcripts:
                    # ignore this transcript
                    fpkm_novel_trans += f
                    continue
                # see if we can assign a gene name based on the gene ID
                try:
                    assoc = stringtie_genes[attr['gene_id']]
                except KeyError:
                    # gene_id not associated with any reference gene
                    fpkm_novel_gene += f
                    continue
                if len(assoc) > 1:
                    # gene ID associated with multiple ref. genes
                    # => ignored
                    fpkm_ambig += f
                    continue
                # gene ID associated with exactly one ref. gene
                g = next(iter(assoc))

            try:
                # NOTE(review): presumably expects `genes` to be sorted --
                # confirm against misc.bisect_index.
                idx = misc.bisect_index(genes, g)
            except ValueError:
                fpkm_unknown_gene_name += f
                logger.warning('Unknown gene name: "%s".', g)
                continue

            t = float(attr['TPM'])
            fpkm[idx] += f
            tpm[idx] += t

    if no_novel_transcripts:
        ignored_fpkm = fpkm_novel_trans + fpkm_unknown_gene_name
    else:
        ignored_fpkm = fpkm_novel_gene + fpkm_ambig + fpkm_unknown_gene_name
    total_fpkm = np.sum(fpkm) + ignored_fpkm
    logger.info('Ignored %.1f / %.1f FPKM (%.1f%%)',
                ignored_fpkm, total_fpkm,
                percentage(ignored_fpkm, total_fpkm))

    if no_novel_transcripts:
        if fpkm_novel_trans > 0:
            logger.info('Ignored %.1f FPKM from novel transcripts (%.1f%%).',
                        fpkm_novel_trans,
                        percentage(fpkm_novel_trans, total_fpkm))
    else:
        if fpkm_novel_gene > 0:
            logger.info(
                'Ignored %.1f FPKM from transcripts of novel genes '
                '(%.1f%%).',
                fpkm_novel_gene, percentage(fpkm_novel_gene, total_fpkm))
        if fpkm_ambig > 0:
            logger.info(
                'Ignored %.1f FPKM from transcripts with ambiguous '
                'gene membership (%.1f%%).',
                fpkm_ambig, percentage(fpkm_ambig, total_fpkm))

    # transcripts with unknown gene names are dropped in both modes
    if fpkm_unknown_gene_name > 0:
        logger.info(
            'Ignored %.1f FPKM from transcripts of genes with unknown '
            'names (%.1f%%).',
            fpkm_unknown_gene_name,
            percentage(fpkm_unknown_gene_name, total_fpkm))

    # write output file
    with open(output_file, 'w') as ofh:
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE)
        for i, g in enumerate(genes):
            writer.writerow([g, '%.5f' % fpkm[i], '%.5f' % tpm[i]])

    return 0
def main(args=None):
    """List the runs of NCBI SRA experiments and write them to a file.

    For every experiment ID in the experiment file, the experiment's
    directory on the NCBI FTP server is listed. Each entry found there is
    treated as a run folder and must contain exactly one file. One
    tab-separated row (experiment ID, run folder name) is written to the
    output file per run folder. No sequence data is downloaded here.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.
        The attributes read are `experiment_file` and `output_file`.

    Returns
    -------
    int
        Exit code (0 if no error occurred).

    Raises
    ------
    ValueError
        If a run folder does not contain exactly one file.
    """
    if args is None:
        # parse command-line arguments
        parser = get_argument_parser()
        args = parser.parse_args()

    experiment_file = args.experiment_file
    output_file = args.output_file

    host = 'ftp-trace.ncbi.nlm.nih.gov'
    # NOTE(review): the credentials appear to be masked in this copy of the
    # source -- confirm the intended login before use.
    user = '******'
    password = '******'

    experiments = misc.read_single(experiment_file)

    runs = []
    with ftputil.FTPHost(host, user, password) as ftp_host:
        for exp in experiments:
            # experiments are grouped on the server by the first six
            # characters of their ID
            exp_dir = '/sra/sra-instant/reads/ByExp/sra/SRX/%s/%s/' \
                % (exp[:6], exp)
            ftp_host.chdir(exp_dir)
            run_folders = ftp_host.listdir(ftp_host.curdir)
            for folder in run_folders:
                files = ftp_host.listdir(folder)
                if len(files) != 1:
                    raise ValueError(
                        'Expected exactly one file in run folder "%s" of '
                        'experiment "%s", found %d.'
                        % (folder, exp, len(files)))
                runs.append((exp, folder))

    # csv.writer emits text, so the file must be opened in text mode
    # (binary mode raises TypeError on Python 3).
    with open(output_file, 'w') as ofh:
        writer = csv.writer(ofh, dialect='excel-tab',
                            lineterminator=os.linesep,
                            quoting=csv.QUOTE_NONE)
        writer.writerows(runs)

    return 0