def intronic(inputfile=None,
             outputfile=None,
             names='transcript_id',
             separator="_",
             intron_nb_in_name=False,
             no_feature_name=False,
             by_transcript=False):
    """
    Extract intronic regions.
    """

    message("Searching for intronic regions.")

    # Need to load if the gtf comes from
    # <stdin>
    gtf = GTF(inputfile, check_ensembl_format=False)

    if not by_transcript:
        introns_bo = gtf.get_introns()

        for i in introns_bo:
            write_properly(chomp(str(i)), outputfile)
    else:
        introns_bo = gtf.get_introns(by_transcript=True,
                                     name=names.split(","),
                                     sep=separator,
                                     intron_nb_in_name=intron_nb_in_name,
                                     feat_name=not no_feature_name)
        for i in introns_bo:
            write_properly(chomp(str(i)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
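
# A minimal usage sketch for intronic(), assuming it is called from a script
# with plain file handles (the paths below are placeholders, not pygtftk
# defaults):
#
#   with open("example.gtf") as gtf_in, open("introns.bed", "w") as bed_out:
#       intronic(inputfile=gtf_in,
#                outputfile=bed_out,
#                by_transcript=True,
#                names="transcript_id,gene_name",
#                separator="|")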
def bed_to_gtf(inputfile=None,
               outputfile=None,
               ft_type="transcript",
               source="Unknown"):
    """
    Convert a bed file to a gtf. This will make the poor bed feel as if it
    were a nice gtf (but with lots of empty fields...). May be helpful
    sometimes...
    """

    message("Converting the bed file into GTF file.")

    # If the input comes from <stdin>, dump it to a temporary
    # file first so that pybedtools can work on a named file.
    if inputfile.name == '<stdin>':
        tmp_file = make_tmp_file(prefix="input_bed", suffix=".bed")
        for i in inputfile:
            write_properly(chomp(str(i)), tmp_file)

        tmp_file.close()
        inputfile.close()

        bed_obj = BedTool(tmp_file.name)
    else:
        bed_obj = BedTool(inputfile.name)

    n = 1
    for i in bed_obj:
        # Fill in missing BED fields with defaults.
        if i.strand == "":
            i.strand = "."
        if i.name == "":
            i.name = "feature_" + str(n)
        if i.score == "":
            i.score = "0"

        if ft_type == "exon":
            key_value = "gene_id \"" + i.name + "\"; " + \
                        "transcript_id \"" + i.name + "\"; " + \
                        "exon_id \"" + i.name + "\";"
        elif ft_type == "gene":
            key_value = "gene_id \"" + i.name + "\";"
        else:
            key_value = "gene_id \"" + i.name + "\"; " + \
                        "transcript_id \"" + i.name + "\";"

        if pygtftk.utils.ADD_CHR == 1:
            chrom_out = "chr" + i.chrom
        else:
            chrom_out = i.chrom

        # BED is zero-based half-open; GTF is one-based inclusive,
        # hence the start + 1 below.
        list_out = [chrom_out,
                    source,
                    ft_type,
                    str(i.start + 1),
                    str(i.end),
                    str(i.score),
                    i.strand,
                    ".",
                    key_value]

        write_properly("\t".join(list_out), outputfile)

        n += 1

    gc.disable()
    close_properly(outputfile)
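
# Usage sketch for bed_to_gtf() (placeholder paths; 'macs2' is an arbitrary
# example value for the GTF source column):
#
#   with open("peaks.bed") as bed_in, open("peaks.gtf", "w") as gtf_out:
#       bed_to_gtf(inputfile=bed_in,
#                  outputfile=gtf_out,
#                  ft_type="gene",
#                  source="macs2")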
def __call__(self, parser, namespace, values, option_string=None):

    from pandas import __version__ as pandas_ver
    from pybedtools import __version__ as pybedtools_ver
    from pyBigWig import __version__ as bigwig_ver
    from pygtftk import __path__ as pygtftk_path
    from pygtftk.cmd_manager import CmdManager
    import subprocess
    from pygtftk.utils import chomp

    info_sys = []
    info_sys += ['\n- pygtftk version : ' + __version__]
    info_sys += ['- pygtftk installation path : ' + pygtftk_path[0]]
    info_sys += ['- pygtftk config directory : ' + CmdManager.config_dir]
    info_sys += ['- pygtftk personal plugins : ' +
                 os.path.join(CmdManager.config_dir, 'plugins')]
    info_sys += ['- python version : ' + str(sys.version_info)]
    info_sys += ['- python path : ' + str(sys.prefix)]
    info_sys += ['- pandas version : ' + pandas_ver]

    bedtools_ver = chomp(subprocess.Popen("bedtools --version",
                                          shell=True,
                                          stdout=subprocess.PIPE).stdout.read().decode())

    info_sys += ['- Bedtools version : ' + bedtools_ver]
    info_sys += ['- pybedtools version : ' + pybedtools_ver]
    info_sys += ['- pyBigWig version : ' + bigwig_ver]
    info_sys += ['- uname : ' + str(os.uname())]

    print("\n".join(info_sys))
    sys.exit()
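
# This __call__ makes the enclosing class usable as a custom argparse Action
# that prints system/dependency information and exits. A minimal wiring
# sketch (assumption: the class is named something like SystemInfoAction and
# derives from argparse.Action; the real class name may differ):
#
#   import argparse
#   parser = argparse.ArgumentParser()
#   parser.add_argument('--system-info', nargs=0, action=SystemInfoAction)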
def midpoints(inputfile=None,
              outputfile=None,
              ft_type="transcript",
              names="transcript_id",
              separator="|"):
    """
    Get the midpoint coordinates for the requested feature.
    """

    message("Loading input file...")
    if inputfile.name == '<stdin>':
        is_gtf = True
    else:
        region_bo = BedTool(inputfile.name)
        if len(region_bo) == 0:
            message("Unable to find requested regions",
                    type="ERROR")

        if region_bo.file_type == 'gff':
            is_gtf = True
        else:
            is_gtf = False

    if is_gtf:
        gtf = GTF(inputfile.name, check_ensembl_format=False)
        bed_obj = gtf.select_by_key("feature",
                                    ft_type).get_midpoints(name=names.split(","),
                                                           sep=separator)
        for line in bed_obj:
            write_properly(chomp(str(line)), outputfile)

    else:
        for line in region_bo:
            diff = line.end - line.start
            if diff % 2 != 0:
                # Odd size: a single central base.
                # e.g. 10-13 (zero-based) -> 11-13 one-based;
                # midpoint is 12 (one-based) -> 11-12 (zero-based).
                # e.g. 949-1100 (zero-based) -> 950-1100 one-based;
                # midpoint is 1025 (one-based) -> 1024-1025 (zero-based).
                # Floor division.
                line.end = line.start + int(diff // 2) + 1
                line.start = line.end - 1
            else:
                # Even size: no single central base. Take both.
                # e.g. 10-14 (zero-based) -> 11-14 one-based;
                # midpoints are 12-13 (one-based) -> 11-13 (zero-based).
                # e.g. 10-5100 (zero-based) -> 11-5100 one-based;
                # midpoints are 2555-2556 (one-based) -> 2554-2556 (zero-based).
                line.start = line.start + int(diff // 2) - 1
                line.end = line.start + 2
            outputfile.write(str(line))

    gc.disable()
    close_properly(outputfile, inputfile)
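
# Worked check of the midpoint arithmetic above (illustration only, not part
# of pygtftk); inputs and outputs are zero-based half-open intervals:
#
#   def midpoint(start, end):
#       diff = end - start
#       if diff % 2 != 0:
#           return start + diff // 2, start + diff // 2 + 1
#       return start + diff // 2 - 1, start + diff // 2 + 1
#
#   assert midpoint(10, 13) == (11, 12)   # odd size: single central base
#   assert midpoint(10, 14) == (11, 13)   # even size: both central bases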
def intergenic(inputfile=None,
               outputfile=None,
               chrom_info=None):
    """
    Extract intergenic regions.
    """

    message("Searching for intergenic regions.")

    gtf = GTF(inputfile)

    intergenic_regions = gtf.get_intergenic(chrom_info)

    nb_intergenic_region = 1

    for i in intergenic_regions:
        i.name = "region_" + str(nb_intergenic_region)
        write_properly(chomp(str(i)), outputfile)
        nb_intergenic_region += 1

    gc.disable()
    close_properly(outputfile, inputfile)
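
# Usage sketch for intergenic() (placeholder paths; assumption: chrom_info is
# an open handle on the usual two-column "chrom<tab>size" file expected by
# pygtftk's --chrom-info arguments):
#
#   with open("example.gtf") as gtf_in, \
#           open("hg38.chromInfo") as chrom, \
#           open("intergenic.bed", "w") as bed_out:
#       intergenic(inputfile=gtf_in, outputfile=bed_out, chrom_info=chrom)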
def feature_size(inputfile=None,
                 outputfile=None,
                 ft_type="transcript",
                 names="transcript_id",
                 key_name='feature_size',
                 separator="|",
                 bed=False):
    """
    Get the size and limits (start/end) of features enclosed in the GTF.
    If bed format is requested, returns the limits zero-based half-open
    with the size as the score. Otherwise, outputs a GTF file with the
    size stored in a new key (default 'feature_size').
    """

    message("Computing feature sizes.")

    gtf = GTF(inputfile)

    feat_list = gtf.get_feature_list(nr=True) + ['mature_rna']

    if ft_type not in feat_list + ["*"]:
        message("Unable to find requested feature.", type="ERROR")

    names = names.split(",")

    if ft_type != 'mature_rna':

        if bed:
            bed_obj = gtf.select_by_key("feature",
                                        ft_type).to_bed(name=names,
                                                        sep=separator,
                                                        add_feature_type=True)

            for i in bed_obj:
                i.score = str(i.end - i.start)
                write_properly(chomp(str(i)), outputfile)
        else:

            tmp_file = make_tmp_file(prefix="feature_size", suffix=".txt")

            elmt = gtf.extract_data("feature,start,end",
                                    as_list_of_list=True,
                                    no_na=False,
                                    hide_undef=False)

            for i in elmt:
                if i[0] != ft_type and ft_type != "*":
                    tmp_file.write("?\n")
                else:
                    # GTF coordinates are one-based inclusive.
                    tmp_file.write(str(int(i[2]) - int(i[1]) + 1) + "\n")

            tmp_file.close()

            gtf.add_attr_column(tmp_file, key_name).write(outputfile, gc_off=True)

    else:

        tx_size = gtf.get_transcript_size()

        if bed:
            bed_obj = gtf.select_by_key("feature",
                                        'transcript').to_bed(['transcript_id'] + names,
                                                             add_feature_type=False,
                                                             sep=separator,
                                                             more_name=['mature_rna'])

            for i in bed_obj:
                names = i.name.split(separator)
                tx_id = names.pop(0)
                i.score = tx_size[tx_id]
                i.name = separator.join(names)
                write_properly(chomp(str(i)), outputfile)
        else:

            if len(tx_size):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=tx_size,
                                             new_key=key_name)
            gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
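
# Usage sketch for feature_size() (placeholder paths; 'mature_rna' asks for
# spliced transcript sizes rather than the genomic span):
#
#   with open("example.gtf") as gtf_in, open("sizes.bed", "w") as out:
#       feature_size(inputfile=gtf_in,
#                    outputfile=out,
#                    ft_type="mature_rna",
#                    bed=True)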
def col_from_tab(inputfile=None,
                 outputfile=None,
                 columns=None,
                 invert_match=False,
                 no_header=False,
                 unique=False,
                 more_col=None,
                 output_separator="\t",
                 separator="\t"):
    """Select columns from a tabulated file based on their names."""

    line_set = dict()

    if re.search(",", columns):
        columns = columns.split(",")
    else:
        columns = [columns]

    if more_col:
        more_col_name, more_col_value = more_col.split(":")
    else:
        more_col_name = more_col_value = None

    for p, line in enumerate(inputfile):

        line = chomp(line)
        line = line.split(separator)

        if p == 0:
            # Use the header line to resolve column names to positions.
            if not invert_match:
                pos_list = list()
                for i in range(len(columns)):
                    pos = line.index(columns[i]) if columns[i] in line else -1
                    if pos > -1:
                        pos_list.append(pos)
                    else:
                        message("Column " + columns[i] + " not found",
                                type="ERROR")
            else:
                # Keep every column except the requested ones.
                pos_list = list(range(len(line)))
                for i in range(len(columns)):
                    pos = line.index(columns[i]) if columns[i] in line else -1
                    if pos > -1:
                        pos_list.remove(pos)
                    else:
                        message("Column " + columns[i] + " not found",
                                type="ERROR")

            if not no_header:
                header_list = [line[k] for k in pos_list]
                if more_col:
                    header_list += [more_col_name]
                header = output_separator.join(header_list)
                write_properly(header, outputfile)
        else:
            out_list = [line[k] for k in pos_list]
            if more_col:
                out_list += [more_col_value]
            out = output_separator.join(out_list)
            if unique:
                if out not in line_set:
                    write_properly(out, outputfile)
                    line_set[out] = 1
            else:
                write_properly(out, outputfile)
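
# Usage sketch for col_from_tab() (placeholder paths and column names):
#
#   with open("table.tsv") as tab_in, open("subset.tsv", "w") as tab_out:
#       col_from_tab(inputfile=tab_in,
#                    outputfile=tab_out,
#                    columns="gene_id,exprs",
#                    unique=True)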
def control_list(in_file=None,
                 out_dir=None,
                 reference_gene_file=None,
                 log2=False,
                 page_width=None,
                 page_height=None,
                 user_img_file=None,
                 page_format=None,
                 pseudo_count=1,
                 set_colors=None,
                 dpi=300,
                 rug=False,
                 jitter=False,
                 skip_first=False):
    # -------------------------------------------------------------------------
    #
    # Check in_file content
    #
    # -------------------------------------------------------------------------

    for p, line in enumerate(in_file):

        line = chomp(line)
        line = line.split("\t")

        if len(line) > 2:
            message("Need a two-column file.",
                    type="ERROR")
        if skip_first:
            if p == 0:
                continue
        try:
            fl = float(line[1])
        except ValueError:
            msg = "It seems that column 2 of the input file"
            msg += " contains non-numeric values. "
            msg += "Check that no header is present and that "
            msg += "columns are ordered properly. "
            msg += "Or use '--skip-first'. "
            message(msg, type="ERROR")

        if log2:
            fl = fl + pseudo_count
            if fl <= 0:
                message("Cannot log-transform negative/zero values. "
                        "Add a pseudo-count.",
                        type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Check colors
    #
    # -------------------------------------------------------------------------

    set_colors = set_colors.split(",")

    if len(set_colors) != 2:
        message("Need two colors. Please fix.", type="ERROR")

    mcolors_name = mcolors.cnames

    for i in set_colors:
        if i not in mcolors_name:
            if not is_hex_color(i):
                message(i + " is not a valid color. Please fix.",
                        type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Preparing output files
    #
    # -------------------------------------------------------------------------

    # Preparing pdf file name
    file_out_list = make_outdir_and_file(out_dir,
                                         ["control_list.txt",
                                          "reference_list.txt",
                                          "diagnostic_diagrams." + page_format],
                                         force=True)

    control_file, reference_file_out, img_file = file_out_list

    if user_img_file is not None:

        os.unlink(img_file.name)
        img_file = user_img_file

        if not img_file.name.endswith(page_format):
            msg = "Image format should be: {f}. Please fix.".format(f=page_format)
            message(msg, type="ERROR")

        test_path = os.path.abspath(img_file.name)
        test_path = os.path.dirname(test_path)

        if not os.path.exists(test_path):
            os.makedirs(test_path)

    # -------------------------------------------------------------------------
    #
    # Read the reference list
    #
    # -------------------------------------------------------------------------

    try:
        reference_genes = pd.read_csv(reference_gene_file.name,
                                      sep="\t",
                                      header=None)
    except pd.errors.EmptyDataError:
        message("No genes in --reference-gene-file.", type="ERROR")

    reference_genes.rename(columns={reference_genes.columns.values[0]: 'gene'},
                           inplace=True)

    # -------------------------------------------------------------------------
    #
    # Delete duplicates
    #
    # -------------------------------------------------------------------------

    before = len(reference_genes)
    reference_genes = reference_genes.drop_duplicates(['gene'])
    after = len(reference_genes)

    msg = "%d duplicate lines have been deleted in reference file."
    message(msg % (before - after))

    # -------------------------------------------------------------------------
    #
    # Read expression data and add the pseudo_count
    #
    # -------------------------------------------------------------------------

    if skip_first:
        exp_data = pd.read_csv(in_file.name,
                               sep="\t",
                               header=None,
                               index_col=None,
                               skiprows=[0],
                               names=['exprs'])
    else:
        exp_data = pd.read_csv(in_file.name,
                               sep="\t",
                               names=['exprs'],
                               index_col=0)

    exp_data.exprs = exp_data.exprs.values + pseudo_count

    # -------------------------------------------------------------------------
    #
    # log transformation
    #
    # -------------------------------------------------------------------------

    ylabel = 'Expression'

    if log2:
        if len(exp_data.exprs.values[exp_data.exprs.values == 0]):
            message("Can't use log transformation on zero or negative values. Use -p.",
                    type="ERROR")
        else:
            exp_data.exprs = np.log2(exp_data.exprs.values)
            ylabel = 'log2(Expression)'

    # -------------------------------------------------------------------------
    #
    # Are reference genes found in the expression data?
    #
    # -------------------------------------------------------------------------

    # Sort in increasing order
    exp_data = exp_data.sort_values('exprs')

    # Reference genes that are present in
    # the expression data index.
    reference_genes_found = [x for x in reference_genes['gene'] if x in exp_data.index]

    msg = "Found %d genes of the reference in the provided signal file." % len(reference_genes_found)
    message(msg)

    not_found = [x for x in reference_genes['gene'] if x not in exp_data.index]

    if len(not_found):
        if len(not_found) == len(reference_genes):
            message("Genes from the reference file were not found in the signal file (n=%d)." % len(not_found),
                    type="ERROR")
        else:
            message("List of reference genes not found: %s" % not_found)
    else:
        message("All reference genes were found.")

    # -------------------------------------------------------------------------
    #
    # Search for genes with matched signal
    #
    # -------------------------------------------------------------------------

    exp_data_save = exp_data.copy()

    control_list = list()

    nb_candidate_left = exp_data.shape[0] - len(reference_genes_found)

    message("Searching for genes with matched signal.")

    if nb_candidate_left < len(reference_genes_found):
        message("Not enough elements to perform selection. Exiting.",
                type="ERROR")

    for i in reference_genes_found:
        # Candidates are genes that are neither in the reference
        # list nor already selected as controls.
        not_candidates = reference_genes_found + control_list
        not_candidates = list(set(not_candidates))

        # Pick the candidate whose expression is closest to gene i.
        diff = abs(exp_data.loc[i] - exp_data)
        control_list.extend(diff.loc[np.setdiff1d(diff.index, not_candidates)].idxmin(axis=0, skipna=True).tolist())

    # -------------------------------------------------------------------------
    #
    # Prepare a dataframe for plotting
    #
    # -------------------------------------------------------------------------

    message("Preparing a dataframe for plotting.")

    reference = exp_data_save.loc[reference_genes_found].sort_values('exprs')
    reference = reference.assign(genesets=['Reference'] * reference.shape[0])

    control = exp_data_save.loc[control_list].sort_values('exprs')
    control = control.assign(genesets=['Control'] * control.shape[0])

    data = pd.concat([reference, control])
    data['sets'] = pd.Series(['sets' for x in data.index.tolist()], index=data.index)
    data['genesets'] = Categorical(data['genesets'])

    # -------------------------------------------------------------------------
    #
    # Diagnostic plots
    #
    # -------------------------------------------------------------------------

    p = ggplot(data, aes(x='sets', y='exprs', fill='genesets'))
    p += scale_fill_manual(values=dict(zip(['Reference', 'Control'], set_colors)))
    p += geom_violin(color=None)
    p += xlab('Gene sets') + ylab(ylabel)
    p += facet_wrap('~genesets')

    if rug:
        p += geom_rug()

    if jitter:
        p += geom_jitter()

    p += theme_bw()
    p += theme(axis_text_x=element_blank())

    # -------------------------------------------------------------------------
    # Turn warnings off. Both pandas and plotnine use warnings for deprecated
    # functions. I need to turn them off although I'm not really satisfied
    # with this solution...
    # -------------------------------------------------------------------------

    def fxn():
        warnings.warn("deprecated", DeprecationWarning)

    # -------------------------------------------------------------------------
    #
    # Saving
    #
    # -------------------------------------------------------------------------

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fxn()

        message("Saving diagram to file : " + img_file.name)
        message("Be patient. This may be long for large datasets.")

        try:
            p.save(filename=img_file.name,
                   width=page_width,
                   height=page_height,
                   dpi=dpi,
                   limitsize=False)
        except PlotnineError as err:
            message("Plotnine message: " + err.message)
            message("Plotnine encountered an error.", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Write results
    #
    # -------------------------------------------------------------------------

    exp_data_save.loc[reference_genes_found].sort_values('exprs').to_csv(reference_file_out.name, sep="\t")
    exp_data_save.loc[control_list].sort_values('exprs').to_csv(control_file.name, sep="\t")
def get_5p_3p_coords(inputfile=None,
                     outputfile=None,
                     ft_type="transcript",
                     names="transcript_id",
                     separator="|",
                     more_names='',
                     transpose=0,
                     invert=False,
                     explicit=False):
    """
    Get the 5p or 3p coordinate for each feature (e.g. TSS or TTS for a
    transcript).
    """

    # more_names may be None or an empty string;
    # in both cases fall back to an empty list.
    if not more_names:
        more_names = []
    else:
        more_names = more_names.split(',')

    if not invert:
        message("Computing 5' coordinates of '" + ft_type + "'.")
    else:
        message("Computing 3' coordinates of '" + ft_type + "'.")

    gtf = GTF(inputfile, check_ensembl_format=False)

    if names != "*":
        nms = names.split(",")
    else:
        nms = gtf.select_by_key("feature",
                                "transcript").get_attr_list(add_basic=False)

    if not invert:
        bed_obj = gtf.get_5p_end(feat_type=ft_type,
                                 name=nms,
                                 sep=separator,
                                 more_name=more_names,
                                 explicit=explicit)
    else:
        bed_obj = gtf.get_3p_end(feat_type=ft_type,
                                 name=nms,
                                 sep=separator,
                                 more_name=more_names,
                                 explicit=explicit)

    if not len(bed_obj):
        message("Requested feature could not be found. Use convert_ensembl maybe.",
                type="ERROR")

    if transpose == 0:
        for i in bed_obj:
            write_properly(chomp(str(i)), outputfile)
    else:
        # Shift the coordinates in a strand-aware fashion.
        for i in bed_obj:
            out_list = list()
            if i.strand == "+":
                out_list = [i.chrom,
                            str(i.start + transpose),
                            str(i.end + transpose),
                            i.name, i.score, i.strand]
            elif i.strand == "-":
                out_list = [i.chrom,
                            str(i.start - transpose),
                            str(i.end - transpose),
                            i.name, i.score, i.strand]
            outputfile.write("\t".join(out_list) + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
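
# Usage sketch for get_5p_3p_coords() (placeholder paths): extract TSS
# coordinates shifted 500 bp downstream in a strand-aware manner.
#
#   with open("example.gtf") as gtf_in, open("tss_shifted.bed", "w") as out:
#       get_5p_3p_coords(inputfile=gtf_in,
#                        outputfile=out,
#                        ft_type="transcript",
#                        transpose=500)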