def main():
    """Normalise expression counts by library size and gene mappability.

    Reads the mappability and expression files named on the command line,
    aborts with an error message when an expressed feature has no
    mappability entry, and writes the normalised values as a gzipped,
    tab-separated table with ``Gene`` and ``Expression`` columns.
    """
    args = parse_arguments()

    mappability = parse_mapability_file(args.mappability)
    expression = parse_expression_file(args.counts)

    # Every expressed feature must have a mappability value.
    unknown_features = expression.index.difference(mappability.index)
    if not unknown_features.empty:
        message = (
            "Feature ID {} is not present in the mappability file. "
            "Make sure that the expressions and mappability file are "
            "derived from the same annotations (GTF/GFF) file."
        ).format(unknown_features[0])
        send_message(error(message))
        sys.exit(1)

    library_size = expression.sum()
    per_library = 10**9 * expression / library_size
    normalised = per_library / mappability

    # Features with zero mappability are reported as zero expression.
    normalised[mappability == 0] = 0.0

    output = normalised.loc[expression.index]
    output.to_csv(
        args.output,
        index_label="Gene",
        header=["Expression"],
        sep="\t",
        compression="gzip",
    )
def test_send_message(self):
    """Check that ``send_message`` delivers a length-prefixed JSON payload.

    A throw-away UNIX socket server is started in a background thread. It
    reads one message (5-digit zero-padded length header followed by the
    body), answers with an ``OK`` response in the same framing, and records
    the received body so the test can compare it with what was sent.
    """

    def _receive(server_socket, result):
        """Accept one connection, store the received body and reply OK."""
        response = {'type_data': 'OK'}
        message_body = json.dumps(response).encode()
        message_header = "{length:0{size}d}".format(
            length=len(message_body), size=5
        ).encode("utf-8")
        message = message_header + message_body

        # Use the socket handed in as an argument instead of relying on the
        # enclosing scope, and make sure the connection is always closed.
        connection = server_socket.accept()[0]
        with connection:
            header_length = int(connection.recv(5))
            received = connection.recv(header_length)
            connection.sendall(message)
        result.append(received)

    result = []
    test_message = "Test data"
    temp_dir = tempfile.mkdtemp()
    try:
        socket_path = os.path.join(temp_dir, "socket.s")
        with patch("resolwe_runtime_utils.COMMUNICATOR_SOCKET", socket_path):
            # The context manager closes the listening socket even when the
            # test body fails.
            with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
                sock.bind(socket_path)
                sock.listen(1)
                server_thread = Thread(target=_receive, args=(sock, result))
                server_thread.start()
                send_message(test_message)
                server_thread.join()
    finally:
        shutil.rmtree(temp_dir)

    self.assertEqual(test_message, json.loads(result[0].decode()))
def parse_expression_file(exp_file, exp_type):
    """Load a gzipped, tab-separated expression table into a DataFrame.

    Only the ``Gene``, ``Transcript`` and ``Expression`` headers are
    accepted; any other header is reported as an error and the program
    exits with status 1. The identifier column is renamed to
    ``FEATURE_ID`` and the value column to ``exp_type``. Rows containing
    missing values are removed before the frame is returned.
    """
    allowed_headers = ["Gene", "Transcript", "Expression"]

    with gzip.open(exp_file) as handle:
        table = pd.read_csv(handle, sep="\t")

    has_unknown_header = any(
        label not in allowed_headers for label in table.columns.values
    )
    if has_unknown_header:
        message = "Invalid column headers {} in file {}.".format(
            table.columns.values, exp_file
        )
        send_message(error(message))
        sys.exit(1)

    new_names = {
        "Gene": "FEATURE_ID",
        "Transcript": "FEATURE_ID",
        "Expression": exp_type,
    }
    # index=str also turns the row labels into strings.
    table = table.rename(index=str, columns=new_names)

    # Feature identifiers are always handled as strings.
    table["FEATURE_ID"] = table["FEATURE_ID"].astype("str")

    # Drop rows that still carry missing values (e.g. empty input lines).
    table = table.dropna()
    return table
def save_results(matched, notmatched, badquality, skipped, total, _progress):
    """Publish per-category read counts with percentages, then the progress.

    Each count is sent as ``"<count> reads (<percent> %)"`` under its own
    field name, where the percentage is taken relative to ``total``.
    """
    total = float(total)
    categories = (
        ("matched", matched),
        ("notmatched", notmatched),
        ("badquality", badquality),
        ("skipped", skipped),
    )
    for field_name, count in categories:
        summary = "{:,} reads ({:.2f} %)".format(count, 100 * count / total)
        send_message(save(field_name, summary))
    send_message(progress(_progress))
def validate_inputs(args):
    """Validate inputs.

    Sends a warning when the samples do not share a single expression type
    and raises ``AssertionError`` when the numbers of sample names,
    expression files and expression types differ.

    :param args: parsed arguments providing ``exp_types``, ``sample_names``
        and ``sample_exps`` sequences.
    :raises AssertionError: if the three sequences differ in length.
    """
    # Validate that all expression types are equal.
    exp_type_set = set(args.exp_types)
    if len(exp_type_set) != 1:
        msg = "All samples should have the same expression type, but multiple expression types were given: {}."
        # Sort so the message does not depend on set iteration order.
        msg = msg.format(", ".join(sorted(exp_type_set)))
        send_message(warning(msg))

    # Validate that same number of sample names, expression files and
    # expression types are given. An explicit raise (instead of ``assert``)
    # keeps the check active when Python runs with -O.
    n_names = len(args.sample_names)
    n_files = len(args.sample_exps)
    n_types = len(args.exp_types)
    if not n_names == n_files == n_types:
        raise AssertionError(
            "The number of sample names ({}), expression files ({}) and "
            "expression types ({}) must be equal.".format(
                n_names, n_files, n_types
            )
        )
def main():
    """Invoke when run directly as a program.

    Validates the tab-separated master file row by row: each row must have
    exactly 12 columns, columns 11 and 12 must be DNA sequences, and the
    amplicon name in column 4 must be unique. Every violation is reported
    with an error message.
    """
    args = parse_arguments()

    amplicon_names = set()
    with open(args.master_file, newline="") as masterfile:
        reader = csv.reader(masterfile, delimiter="\t")
        for row in reader:
            if len(row) != 12:
                send_message(
                    error(
                        "Uploaded master file must contain exactly 12 columns."
                    ))
                # Column positions cannot be trusted for a malformed row, and
                # indexing a short row would raise IndexError, so skip the
                # column-specific checks.
                continue

            if not check_dna_sequence(row[10]):
                send_message(error("11th column must contain a DNA sequence."))
            if not check_dna_sequence(row[11]):
                send_message(error("12th column must contain a DNA sequence."))

            amp_name = row[3]
            if amp_name not in amplicon_names:
                amplicon_names.add(amp_name)
            else:
                send_message(
                    error(
                        "Amplicon names must be unique. Amplicon {} is seen multiple times."
                        .format(amp_name)))
def main():
    """Plot expected versus measured ERCC spike-in expression per sample.

    Samples whose ERCC spike-ins all have zero expression are not plotted;
    a warning is collected for each of them. If no sample could be plotted,
    a single summary warning is sent instead of the per-sample ones.
    """
    args = parse_arguments()
    validate_inputs(args)

    exp_type = args.exp_types[0]
    spikeins_mix = args.spikeins_mix
    expected = get_expected(spikeins_mix, log2=True)

    pending_warnings = []
    any_sample_plotted = False

    for sample_name, sample_exp in zip(args.sample_names, args.sample_exps):
        measured_zero = get_measured(
            sample_exp, sample_name, exp_type, only_zero=True
        )
        measured_nonzero = get_measured(
            sample_exp, sample_name, exp_type, only_nonzero=True, log2=True
        )

        merged_zero = merge_expected_measured(expected, measured_zero)
        merged_nonzero = merge_expected_measured(expected, measured_nonzero)

        # Restrict to ERCC spike-ins before deciding whether to plot.
        nonzero_ercc = merged_nonzero.iloc[
            merged_nonzero.index.str.startswith("ERCC"), :
        ]
        if nonzero_ercc.empty:
            pending_warnings.append(
                "All ERCC spike-ins have zero expression in sample {}".format(
                    sample_name
                )
            )
            continue

        any_sample_plotted = True

        expected_ercc = expected.iloc[expected.index.str.startswith("ERCC")]
        zero_ercc = merged_zero.iloc[merged_zero.index.str.startswith("ERCC"), :]
        plot_histogram_scatter(
            expected=expected_ercc,
            zero=zero_ercc,
            nonzero=nonzero_ercc,
            spikein_type="ERCC",
            sample_name=sample_name,
            exp_type=exp_type,
        )

    if not any_sample_plotted:
        # Every sample lacks spike-in expression: one summary warning is
        # clearer than repeating the same warning for each sample.
        send_message(
            warning("All ERCC spike-ins in all samples have zero expression.")
        )
        return

    for text in pending_warnings:
        send_message(warning(text))
def parse_mappings(species, infile, outfile):
    """Collect chromosome name mappings for ``species``.

    When no mapping files are registered for the species, a warning is
    sent, ``infile`` is renamed to ``outfile`` unchanged and the program
    exits with status 0. Otherwise the mappings from all registered files
    are merged into one dictionary, later files overriding earlier ones.
    """
    # An unsupported species is not an error: hand the input back under the
    # output name, warn, and stop successfully.
    if species not in MAPPINGS_FILES:
        message = 'Chromosome mappings for Species "{}" are not supported.'.format(
            species)
        send_message(warning(message))
        os.rename(infile, outfile)
        sys.exit(0)

    combined = {}
    for file_basename in MAPPINGS_FILES[species]:
        mapping_path = os.path.join(MAPPINGS_DIR, file_basename)
        combined.update(parse_mapping_file(mapping_path))
    return combined
def get_pca(expressions=None, n_components=2, gene_labels=None):
    """Compute PCA.

    :param expressions: DataFrame with genes as rows and samples as
        columns; defaults to an empty DataFrame.
    :param n_components: number of principal components to compute.
    :param gene_labels: optional sequence of requested gene labels; when
        missing or empty, the index of ``expressions`` is used.
    :return: dict with ``coordinates`` (one 2D point per sample),
        ``all_components``, ``all_explained_variance_ratios``,
        ``skipped_gene_labels`` (requested labels absent from
        ``expressions``) and ``warning`` (always ``None``).

    A warning message is sent when ``expressions`` is empty.
    """
    # Fresh objects per call: mutable defaults in the signature would be
    # shared between calls.
    if expressions is None:
        expressions = pd.DataFrame()
    # Explicit emptiness test so array-like labels (Index, ndarray) do not
    # raise on ambiguous truthiness.
    if gene_labels is None or len(gene_labels) == 0:
        gene_labels = expressions.index

    skipped_gene_labels = list(set(gene_labels).difference(expressions.index))

    n_genes, n_samples = expressions.shape
    if n_genes < 2 or n_samples < 2:
        # Too little data for PCA: place every sample at the origin.
        coordinates = [[0.0, 0.0] for _ in range(n_samples)]
        all_components = [[], []]
        all_explained_variance_ratios = [0.0, 0.0]
    else:
        pca = PCA(n_components=n_components, whiten=True)
        pca_expressions = pca.fit_transform(expressions.transpose())

        # Always report two coordinates, padding with 0.0 when only one
        # component was computed.
        coordinates = [
            t[:2].tolist() if len(t) > 1 else [t[0], 0.0]
            for t in pca_expressions
        ]
        all_components = [
            component_top_factors(component, gene_labels)
            for component in pca.components_
        ]
        if np.isnan(pca.explained_variance_ratio_).any():
            all_explained_variance_ratios = [
                0.0 for _ in pca.explained_variance_ratio_
            ]
        else:
            all_explained_variance_ratios = pca.explained_variance_ratio_.tolist()

    result = {
        "coordinates": coordinates,
        "all_components": all_components,
        "all_explained_variance_ratios": all_explained_variance_ratios,
        "skipped_gene_labels": skipped_gene_labels,
        "warning": None,
    }

    if expressions.empty:
        send_message(
            warning(
                "Gene selection and filtering resulted in no genes. Please select different samples or genes."
            ))

    return result
def main():
    """Invoke when run directly as a program.

    Builds gene sets from the differential expression file using the given
    logFC and FDR thresholds and saves each non-empty set to the output
    directory. A warning is sent for every empty gene set.
    """
    args = parse_arguments()

    gene_sets = create_gene_sets(args.dge_file, args.logfc, args.fdr)
    fname_prefix = generate_name(args.analysis_name, args.tool, args.logfc,
                                 args.fdr)

    out_dir = Path(args.out_dir)
    # Create missing parent directories too, and tolerate a directory that
    # already exists (avoids the check-then-create race).
    out_dir.mkdir(parents=True, exist_ok=True)

    for name, data in gene_sets.items():
        if data.empty:
            send_message(
                warning(
                    f"No {name}-regulated genes. Gene set was not created."))
        else:
            save_genes(data, out_dir / f"{fname_prefix}_{name}.tab.gz")
def main():
    """Invoke when run directly as a program.

    Reads gene identifiers (one per line, blank lines ignored) from the
    gene set file, removes duplicates, and writes the sorted set both as
    compact JSON and as a gzipped newline-separated text file. A warning is
    sent when duplicates were removed.
    """
    args = parse_arguments()

    # Plain text mode already uses universal newlines; the legacy "U" flag
    # was removed in Python 3.11 and raises ValueError there.
    with open(args.geneset_file, "r") as infile:
        # Skip empty lines in the input gene set file.
        stripped_lines = (line.strip() for line in infile)
        genes = [gene for gene in stripped_lines if gene]

    geneset = sorted(set(genes))

    if len(genes) != len(geneset):
        send_message(warning("Removed duplicated genes."))

    with open(args.output_json, "w") as json_out:
        json.dump({"genes": geneset},
                  json_out,
                  separators=(",", ":"),
                  allow_nan=False)

    with gzip.open(args.output_file, "wb") as file_out:
        file_out.write("\n".join(geneset).encode("utf-8"))
def parse_mapability_file(mapability_file):
    """Parse mapability file to a Pandas Series.

    :param mapability_file: path to a tab-separated file with ``gene_id``
        and ``coverage`` columns (other columns are ignored).
    :return: Series of coverage values indexed by gene ID, with missing
        values dropped.

    On a read or parse failure an error message is sent and the program
    exits with status 1.
    """
    try:
        mappability = pd.read_csv(
            mapability_file,
            sep="\t",
            usecols=["gene_id", "coverage"],
            index_col="gene_id",
            dtype={
                "gene_id": str,
                "coverage": float,
            },
        )
        # read_csv(squeeze=True) was removed in pandas 2.0; squeezing the
        # column axis of the one-column frame yields the same Series, even
        # for a single row.
        return mappability.squeeze("columns").dropna()
    except (ValueError, OSError) as parse_error:
        send_message(
            error("Failed to read mappability file {}. {}".format(
                basename(mapability_file), parse_error)))
        sys.exit(1)
def create_new_header(infile, mappings, outfile):
    """Write a copy of a BigWig file that uses UCSC chromosome names.

    If the input already uses only names found among the mapping targets,
    or none of its chromosomes can be mapped, the input is renamed to
    ``outfile`` and the program exits with status 0. Otherwise intervals of
    mappable chromosomes are copied under their mapped names, and a warning
    reports how many sequences had no mapping.
    """
    with pyBigWig.open(infile) as bw:
        source_chroms = bw.chroms()

        # Already UCSC-named input needs no conversion; this is a normal
        # outcome, hence exit status 0.
        if set(source_chroms.keys()).issubset(mappings.values()):
            os.rename(infile, outfile)
            sys.exit(0)

        hdr = []
        for chrom, length in source_chroms.items():
            if chrom in mappings:
                hdr.append((mappings[chrom], length))

        if not hdr:
            msg = "Neither of the chromosomes in the input file has a valid UCSC pair. No mapping will be done."
            send_message(warning(msg))
            os.rename(infile, outfile)
            sys.exit(0)

        unmapped_count = 0
        with pyBigWig.open(outfile, "w") as bw_output:
            bw_output.addHeader(hdr)
            for chrom, length in source_chroms.items():
                intervals = bw.intervals(chrom, 0, length)
                if chrom not in mappings:
                    unmapped_count += 1
                    print("UCSC chromosome/conting mapping for {} is missing".
                          format(chrom))
                elif intervals:
                    ucsc_name = mappings[chrom]
                    bw_output.addEntries(
                        [ucsc_name] * len(intervals),
                        [entry[0] for entry in intervals],
                        ends=[entry[1] for entry in intervals],
                        values=[entry[2] for entry in intervals],
                    )

        if unmapped_count > 0:
            send_message(
                warning(
                    "UCSC chromosome/conting mapping for {} sequence(s) is missing. "
                    "This sequence(s) will not be included in the bigWig file."
                    .format(unmapped_count)))
def parse_expression_file(exp_file):
    """Parse expression file to a Pandas Series.

    :param exp_file: path to a gzipped, tab-separated file with ``Gene``
        and ``Expression`` columns (other columns are ignored).
    :return: Series of expression values indexed by gene, with missing
        values dropped.

    On a read or parse failure an error message is sent and the program
    exits with status 1.
    """
    try:
        expression = pd.read_csv(
            exp_file,
            sep="\t",
            compression="gzip",
            usecols=["Gene", "Expression"],
            index_col="Gene",
            dtype={
                "Gene": str,
                "Expression": float,
            },
        )
        # read_csv(squeeze=True) was removed in pandas 2.0; squeezing the
        # column axis of the one-column frame yields the same Series, even
        # for a single row.
        return expression.squeeze("columns").dropna()
    except (ValueError, OSError) as parse_error:
        send_message(
            error("Failed to read input file {}. {}".format(
                basename(exp_file), parse_error)))
        sys.exit(1)
def main():
    """Report strandedness and fragment ratio from a library type JSON file.

    When the file contains both ``expected_format`` and
    ``compatible_fragment_ratio``, they are saved as ``strandedness`` and
    ``fragment_ratio`` (rounded to two decimals, as a string). Otherwise an
    error message is sent.
    """
    args = parse_arguments()

    with open(args.input_file) as infile:
        data = json.load(infile)

    required_keys = ("expected_format", "compatible_fragment_ratio")
    if not all(key in data for key in required_keys):
        send_message(error("Cannot parse library type information file."))
        return

    send_message(save("strandedness", data["expected_format"]))
    ratio_text = str(round(data["compatible_fragment_ratio"], 2))
    send_message(save("fragment_ratio", ratio_text))
def main():
    """Annotate expressions with gene symbols and merge extra expression files.

    Validates the additional expression arguments, maps feature IDs of the
    main expression file to gene symbols fetched from the knowledge base
    (in chunks of ``CHUNK_SIZE`` IDs), merges any additional expression
    files on ``FEATURE_ID``, and writes the result both as a gzipped
    tab-separated file and as JSON.
    """
    args = parse_arguments()

    extra_files = args.norm_expressions
    extra_types = args.norm_expressions_type
    main_type = args.expressions_type

    if extra_files and extra_types and len(extra_files) != len(extra_types):
        send_message(
            error(
                "The number of additional expression files must match the number of specified "
                "expressions types."))
        sys.exit(1)

    if extra_types:
        all_types = [main_type] + extra_types
        if len(set(all_types)) != len(all_types):
            send_message(
                error(
                    "The union of the main expression type ({}) and additional normalized expression types {} "
                    "does not contain unique items.".format(
                        args.expressions_type, args.norm_expressions_type)))
            sys.exit(1)

    res = resdk.Resolwe()

    df = parse_expression_file(args.expressions, main_type)

    # All feature IDs present in the main expression file.
    input_features = df["FEATURE_ID"].tolist()

    # Query the knowledge base chunk by chunk and collect a
    # {feature_id: feature_name} mapping.
    feature_dict = {}
    for start in range(0, len(input_features), CHUNK_SIZE):
        chunk = input_features[start:start + CHUNK_SIZE]
        features = res.feature.filter(source=args.source_db,
                                      species=args.species,
                                      feature_id__in=chunk)
        feature_dict.update({f.feature_id: f.name for f in features})

    # Attach gene symbols to the feature IDs.
    df["GENE_SYMBOL"] = df["FEATURE_ID"].map(feature_dict)

    # Warn when some feature IDs have no associated gene symbol.
    if any(f_id not in feature_dict for f_id in input_features):
        unmapped_count = sum(df.isnull().values.ravel())
        send_message(
            warning(
                "{} feature(s) could not be mapped to the associated feature symbols."
                .format(unmapped_count)))

    # Merge additional expression files with the original data frame.
    if extra_files and extra_types:
        for exp_file, exp_type in zip(extra_files, extra_types):
            extra_df = parse_expression_file(exp_file, exp_type)
            df = df.merge(extra_df, on="FEATURE_ID")

    # Put the columns into their final order.
    columns = ["FEATURE_ID", "GENE_SYMBOL", main_type]
    if extra_types:
        columns = columns + extra_types
    df = df[columns]

    # Missing values are written as empty strings.
    df = df.fillna("")

    output_table = args.output_name + ".txt.gz"
    df.to_csv(
        output_table,
        header=True,
        index=False,
        sep="\t",
        compression="gzip",
    )

    # JSON output keyed by feature ID.
    df_dict = df.set_index("FEATURE_ID").to_dict(orient="index")
    with open(args.output_name + ".json", "w") as f:
        json.dump({"genes": df_dict}, f, allow_nan=False)
genes[str(x[0])] = x[1:] return times, genes if file_name[-4:] == ".xls" or file_name[-5:] == ".xlsx": times, genes = import_excel(file_name) else: times, genes = import_table(file_name) etcjson = '{"etc":%s}' % json.dumps({ "genes": genes, "timePoints": times }, separators=(",", ":")) send_message( save( "etc", json.dumps({ "genes": genes, "timePoints": times }, separators=(",", ":")))) zipfile = gzip.GzipFile( filename="", mode="wb", fileobj=open("etc.json.gz", "wb"), mtime=0, ) zipfile.write(etcjson.encode("utf-8"))
import argparse

import pandas as pd
from pandas.errors import EmptyDataError
from resolwe_runtime_utils import error, send_message

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("-f", "--bed_file", help="Bed file.")
args = parser.parse_args()

# Read every column as text so untouched fields are written back verbatim.
try:
    df = pd.read_csv(args.bed_file, delimiter="\t", header=None, dtype=str)
except EmptyDataError:
    send_message(
        error(
            f"The input BED file {args.bed_file} is empty. Your analysis might "
            f"have failed to identify regions of interest (peaks, junctions, etc.)."
        )
    )
else:
    # The score column (5th) is optional in BED files; only correct it when
    # present. Scores are rounded to integers and capped at 1000. Replacing
    # the whole column avoids writing integers into a text-typed column.
    if len(df.columns) >= 5:
        score = pd.to_numeric(df.iloc[:, 4]).round().astype(int)
        df[df.columns[4]] = score.clip(upper=1000)

    # if strand column exist replace '?' with '.'
    if len(df.columns) >= 6:
        df[df.columns[5]] = df.iloc[:, 5].replace("?", ".")

    output_name = "_".join(["corrected", args.bed_file])
    df.to_csv(output_name, sep="\t", index=False, header=False)
import pandas as pd
from resolwe_runtime_utils import send_message, warning

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "-bed",
    "--bed_file",
    required=True,
    help="All splice junctions in BED12 format",
)
parser.add_argument(
    "-sj",
    "--novel_sj",
    required=True,
    help="Table of annotated novel splice junctions",
)


if __name__ == "__main__":
    args = parser.parse_args()
    bed_file = args.bed_file

    # An empty BED file is passed through unchanged under the output name.
    if not os.path.getsize(bed_file):
        send_message(warning("Bed file has no entries."))
        os.rename(bed_file, "novel_sj.bed")
        sys.exit(0)

    bed = pd.read_csv(args.bed_file, delimiter="\t", header=None, dtype=str)
    novel_sj = pd.read_csv(args.novel_sj, delimiter="\t", dtype=str)

    # Keep only junctions whose name (4th BED column) is listed as novel.
    is_novel = bed[3].isin(novel_sj["name"])
    bed_novel_sj = bed[is_novel]
    bed_novel_sj.to_csv("novel_sj.bed", sep="\t", index=False, header=False)
break if args.c: x_axis = data.iloc[:, 8][::-1] y_axis = data.iloc[:, 6] - data.iloc[:, 7] else: x_axis = data.iloc[:, 7][::-1] y_axis = data.iloc[:, 6] n_sup_enh, rows = data[data.isSuper == 1].shape chr_pos = data.CHROM.map(str) + ":" + data.START.map( str) + "-" + data.STOP.map(str) if len(x_axis) != len(y_axis): send_message(error("Scatter plot error. len(x_axis) != len(y_axis)")) if len(labels) > 0 and len(labels) != len(x_axis): send_message(error("Scatter plot error. len(labels) != len(x_axis)")) data = { "points": { "x_axis": list(x_axis), "y_axis": list(y_axis), "items": labels }, "annotations": [ { "type": "line", "x1": 0, "y1": float(cutoff),
import os

from pysam import VariantFile
from resolwe_runtime_utils import error, send_message, warning

# Command-line interface: the VCF file to inspect and the summary file that
# is opened in append mode below.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("vcf_file", help="VCF file (can be compressed using gzip/bgzip).")
parser.add_argument("summary", help="Summary file to append to.")
args = parser.parse_args()

try:
    vcf = VariantFile(args.vcf_file)
except (OSError, ValueError) as error_msg:
    # Send a generic error message, then re-raise the underlying problem as
    # a ValueError so the script stops here.
    proc_error = "Input VCF file does not exist or could not be correctly opened."
    send_message(error(proc_error))
    raise ValueError(error_msg)

vcf_header = vcf.header
# Header records as a key -> value mapping; when a key repeats, the last
# record's value is the one kept.
header_records = {record.key: record.value for record in vcf_header.records}

with open(args.summary, "a") as out_file:
    try:
        # Only the file name of the "reference" header entry is kept.
        fasta_name = os.path.basename(header_records["reference"])
    except KeyError:
        # No "reference" header record: fall back to an empty name and warn.
        fasta_name = ""
        send_message(
            warning(
                "Reference sequence (FASTA) name could not be recognized from the VCF header."
            ))
def main():
    """Invoke when run directly as a program.

    Reads feature IDs from a file, checks that the knowledge base knows at
    least one of them, optionally maps them from the source to the target
    database, runs the external ``processor`` tool on the resulting IDs and
    stores its standard output in ``terms.json``.

    Exits with status 1 when no features are found or no mapping exists.
    """
    args = parse_arguments()

    res = resdk.Resolwe()

    with open(args.feature_ids) as gene_file:
        genes = [gene.strip() for gene in gene_file]

    org_features = res.feature.filter(source=args.source_db,
                                      species=args.species,
                                      feature_id__in=genes)

    if len(org_features) == 0:
        send_message(error("No genes were fetched from the knowledge base."))
        # Raise SystemExit directly; the ``exit`` helper is only injected by
        # the ``site`` module and is not guaranteed to exist.
        raise SystemExit(1)

    if args.source_db == args.target_db:
        target_ids = genes
    else:
        mapping_res = res.mapping.filter(
            source_db=args.source_db,
            source_species=args.species,
            target_db=args.target_db,
            target_species=args.species,
            source_id__in=genes,
        )

        if len(mapping_res) == 0:
            send_message(error("Failed to map features."))
            raise SystemExit(1)

        # A set gives constant-time membership tests instead of scanning the
        # gene list once per mapping.
        requested_genes = set(genes)

        mappings = {}
        for m in mapping_res:
            if m.source_id in requested_genes:
                if m.source_id not in mappings:
                    # The first mapping seen for a source ID wins.
                    mappings[m.source_id] = m.target_id
                else:
                    send_message(
                        warning(
                            "Mapping {} returned multiple times.".format(m)))

        if len(genes) > len(mappings):
            send_message(warning("Not all features could be mapped."))

        target_ids = mappings.values()

    # The tool reads the space-separated IDs from a temporary file, so the
    # file must be flushed and kept open while the tool runs.
    with tempfile.NamedTemporaryFile() as input_genes:
        input_genes.write(" ".join(target_ids).encode("UTF-8"))
        input_genes.flush()
        process = Popen(
            [
                "processor",
                str(args.pval),
                str(args.min_genes),
                args.obo,
                args.gaf,
                input_genes.name,
            ],
            stdout=PIPE,
            stderr=DEVNULL,
        )
        out, _ = process.communicate()

    with open("terms.json", "w") as f:
        f.write(out.decode("UTF-8"))
def set_error(msg):
    """Send ``msg`` as an error message, then raise ``ValueError(msg)``."""
    error_message = error(msg)
    send_message(error_message)
    raise ValueError(msg)
print('{"rc":"1"}') exit(1) def isfloat(value): """Check if value is float.""" try: float(value) return True except ValueError: return False with utils.gzopen(args.input) as f: # Split lines by tabs # Ignore lines without a number in second column # Build a dictionary of gene-expression pairs exp = { "genes": { gene_exp[0]: float(gene_exp[1]) for gene_exp in (l.split("\t") for l in f) if len(gene_exp) == 2 and isfloat(gene_exp[1]) } } if args.output: with open(args.output, "w") as f: json.dump(exp, f) else: send_message(save("exp_json", json.dumps(exp, separators=(",", ":"))))
def read_multiplexed(reads1_file, reads2_file, barcodes_file, pool_maps,
                     progress_start):
    """Parse multiplexed file.

    Reads the gzipped reads file(s) and the gzipped barcodes file in
    lockstep, one line per record, and writes each read as a four-line
    FASTQ-style entry into a gzipped output file chosen by its barcode.
    Reads whose barcode is not in ``pool_maps`` go to the "Not_Matched"
    file(s); reads whose last field is not "1" go to the "Bad_Quality"
    file(s). Counts and progress are reported through ``send_message``.

    :param reads1_file: gzipped reads file (mate 1); the part of its name
        before the first "." is used as the pool name in output file names.
    :param reads2_file: gzipped mate 2 file, or a falsy value for
        single-end data.
    :param barcodes_file: gzipped file with the barcode reads.
    :param pool_maps: mapping from barcode to a sample name.
    :param progress_start: progress value reported before reading starts.
    :return: sorted list of the names of all output files that were opened.

    NOTE(review): relies on a module-level ``barcode_length`` that is not
    defined in this function — confirm it is set before the call.
    """
    pool_name = reads1_file.split(".")[0]

    def nicename(a):
        """Turn a sample name into a string usable in a file name."""
        return a.replace("#", "").replace(" ", " ").replace("/", " ").replace(" ", "_")

    files, f1, f2, fbar = {}, None, None, None
    try:
        barcodes = set(pool_maps.keys())
        print("BARCODES: {}".format(barcodes))

        # One output file per barcode (two for paired-end data; the mate 2
        # file is stored under the key barcode + "2").
        for barcode in barcodes:
            name = nicename(pool_maps[barcode])
            if reads2_file:
                filename = "{}_{}_{}_mate1.fq.gz".format(
                    pool_name, name, barcode)
                files[barcode] = gzip.open(filename, "wb")
                filename = "{}_{}_{}_mate2.fq.gz".format(
                    pool_name, name, barcode)
                files[barcode + "2"] = gzip.open(filename, "wb")
            else:
                filename = "{}_{}_{}.fq.gz".format(pool_name, name, barcode)
                files[barcode] = gzip.open(filename, "wb")

        # Catch-all outputs for unknown barcodes and for reads that fail the
        # flag check below.
        if reads2_file:
            files["notmatched"] = gzip.open(
                "Not_Matched_{}_mate1.fq.gz".format(pool_name), "wb")
            files["badquality"] = gzip.open(
                "Bad_Quality_{}_mate1.fq.gz".format(pool_name), "wb")
            files["notmatched2"] = gzip.open(
                "Not_Matched_{}_mate2.fq.gz".format(pool_name), "wb")
            files["badquality2"] = gzip.open(
                "Bad_Quality_{}_mate2.fq.gz".format(pool_name), "wb")
        else:
            files["notmatched"] = gzip.open(
                "Not_Matched_{}.fq.gz".format(pool_name), "wb")
            files["badquality"] = gzip.open(
                "Bad_Quality_{}.fq.gz".format(pool_name), "wb")

        filenames = list(sorted(set(f.name for f in files.values())))

        # Count the lines of the barcodes file with an external pipeline.
        # NOTE(review): the file name is interpolated into a shell command
        # (shell=True); names with spaces or shell metacharacters would
        # break or alter the command — confirm inputs are trusted.
        p = subprocess.Popen(
            "gzip -dc {} | wc -l".format(barcodes_file),
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        numlines, err = p.communicate()
        # Any output on stderr is treated as a failure.
        if err:
            raise Exception(err)

        numlines = int(numlines)
        readid, matched, notmatched, badquality, skipped = 0, 0, 0, 0, 0
        send_message(progress(progress_start))
        _progress = progress_start
        # Progress advances in 20 steps from progress_start up to 0.9.
        progress_step = (0.9 - _progress) / 20.0
        progress_span = numlines / 20

        def save_results(matched, notmatched, badquality, skipped, total,
                         _progress):
            """Report the counts as "<n> reads (<percent> %)" and the progress."""
            total = float(total)
            send_message(
                save(
                    "matched",
                    "{:,} reads ({:.2f} %)".format(matched,
                                                   100 * matched / total),
                ))
            send_message(
                save(
                    "notmatched",
                    "{:,} reads ({:.2f} %)".format(notmatched,
                                                   100 * notmatched / total),
                ))
            send_message(
                save(
                    "badquality",
                    "{:,} reads ({:.2f} %)".format(badquality,
                                                   100 * badquality / total),
                ))
            send_message(
                save(
                    "skipped",
                    "{:,} reads ({:.2f} %)".format(skipped,
                                                   100 * skipped / total),
                ))
            send_message(progress(_progress))

        f1 = gzip.GzipFile(reads1_file, "r")
        fbar = gzip.GzipFile(barcodes_file, "r")
        if reads2_file:
            f2 = gzip.GzipFile(reads2_file, "r")

        while True:
            readid += 1

            # Each record is one tab-separated line that must have exactly
            # 11 fields. The third-from-last field is used as the sequence
            # ("." replaced by "N"), the second-from-last is written as the
            # fourth output line, and the last one is the flag compared
            # with "1" below.
            # NOTE(review): a malformed line triggers `continue` before the
            # other files are read for this record, which presumably shifts
            # the files out of step — confirm intended.
            r1 = f1.readline()
            if not r1:
                break
            r1 = r1.decode("utf-8").rstrip("\r").rstrip("\n").split("\t")
            if len(r1) != 11:
                print("SKIPPED: error in {} line in r1".format(readid))
                continue
            s1 = r1[-3].replace(".", "N")
            p1 = r1[-1]

            rbar = fbar.readline()
            if not rbar:
                break
            rbar = rbar.decode("utf-8").rstrip("\r").rstrip("\n").split("\t")
            if len(rbar) != 11:
                print("SKIPPED: error in {} line in rbar".format(readid))
                continue
            # Only the first barcode_length characters form the barcode.
            sbar = rbar[-3].replace(".", "N")[:barcode_length]
            pbar = rbar[-1]

            if reads2_file:
                r2 = f2.readline()
                if not r2:
                    break
                r2 = r2.decode("utf-8").rstrip("\r").rstrip("\n").split("\t")
                if len(r2) != 11:
                    print("SKIPPED: error in {} line in r2".format(readid))
                    continue
                s2 = r2[-3].replace(".", "N")
                p2 = r2[-1]
            else:
                # Single-end: reuse mate 1 so the comparisons below hold.
                r2 = r1
                p2 = p1

            # The first seven fields and the flag must agree across the
            # reads and barcode records; otherwise the record is skipped.
            if r1[:7] == r2[:7] == rbar[:7] and p1 == p2 == pbar:
                idline = "@" + ":".join(r1[:7]) + " " + sbar
                if p1 == "1" and p2 == "1":
                    if sbar in barcodes:
                        files[sbar].write(
                            (idline + "\n" + s1 + "\n" + "+" + "\n" + r1[-2] +
                             "\n").encode("utf-8"))
                        if reads2_file:
                            files[sbar + "2"].write(
                                (idline + "\n" + s2 + "\n" + "+" + "\n" +
                                 r2[-2] + "\n").encode("utf-8"))
                        matched += 1
                    else:
                        files["notmatched"].write(
                            (idline + "\n" + s1 + "\n" + "+" + "\n" + r1[-2] +
                             "\n").encode("utf-8"))
                        if reads2_file:
                            files["notmatched2"].write(
                                (idline + "\n" + s2 + "\n" + "+" + "\n" +
                                 r2[-2] + "\n").encode("utf-8"))
                        notmatched += 1
                else:
                    files["badquality"].write(
                        (idline + "\n" + s1 + "\n" + "+" + "\n" + r1[-2] +
                         "\n").encode("utf-8"))
                    if reads2_file:
                        files["badquality2"].write(
                            (idline + "\n" + s2 + "\n" + "+" + "\n" + r2[-2] +
                             "\n").encode("utf-8"))
                    badquality += 1
            else:
                print("SKIPPED: {}, p1: {}, p2: {}, pbar: {}".format(
                    readid, p1, p2, pbar))
                print("{} ? {} ? {}".format(r1[:7], r2[:7], rbar[:7]))
                skipped += 1

            # Publish intermediate results whenever readid is a multiple of
            # progress_span.
            if readid % progress_span == 0:
                _progress += progress_step
                save_results(matched, notmatched, badquality, skipped, readid,
                             _progress)

        # Final report at progress 0.9.
        save_results(matched, notmatched, badquality, skipped, readid, 0.9)
    finally:
        # Close every file that was opened, even after an error.
        if f1:
            f1.close()
        if f2:
            f2.close()
        if fbar:
            fbar.close()

        for f in files:
            files[f].close()

    return filenames
t = line.split("\t") barcode, filename = "", "" if len(t) == 2: barcode, filename = t[0:2] if len(t) > 2 and isnum(t[0]): barcode, filename = t[1:3] barcode, filename = barcode.strip(), filename.strip() if barcode and filename: pool_maps[barcode] = filename if barcode_length > 0 and barcode_length != len(barcode): send_message( error("Barcodes should be of the same length.")) exit(1) else: barcode_length = len(barcode) for bar, _map in iteritems(pool_maps): print("{}: {}".format(bar, _map)) def read_multiplexed(reads1_file, reads2_file, barcodes_file, pool_maps, progress_start): """Parse multiplexed file.""" pool_name = reads1_file.split(".")[0] def nicename(a): return a.replace("#", "").replace(" ",
#!/usr/bin/env python3
"""Check if sample names are unique."""

import argparse

from resolwe_runtime_utils import error, send_message

parser = argparse.ArgumentParser(
    description="Check if sample names are unique",
)
parser.add_argument("samples", help="All samples")
args = parser.parse_args()

# Sample names arrive as one comma-separated string.
samples = args.samples.split(",")

# A set drops repeated names, so a size difference reveals a duplicate.
distinct_samples = set(samples)
if len(distinct_samples) != len(samples):
    duplicate_error = error("Sample names must be unique.")
    send_message(duplicate_error)
def main():
    """Compute sample hierarchical clustering.

    Validates the command-line arguments, loads the expressions shared by
    all samples, optionally transforms them and removes samples with
    constant expression, then computes the clustering and writes sample
    IDs, linkage and leaf order as JSON.

    Invalid inputs are reported through ``set_error``, which sends the
    error message and raises ``ValueError``. Excluded genes and removed
    samples are reported as warnings; at most three names are listed and a
    longer list is marked with a trailing ", ...".
    """
    args = parse_args()

    if len(args.sample_files) != len(args.sample_ids):
        msg = "The number of sample files does not match the number of sample IDs."
        set_error(msg)

    if len(args.sample_files) != len(args.sample_names):
        msg = "The number of sample files does not match the number of sample names."
        set_error(msg)

    if len(args.sample_files) < 2:
        msg = (
            "Select at least two samples to compute hierarchical clustering of samples."
        )
        set_error(msg)

    if len(args.gene_labels) == 1 and args.distance_metric != "euclidean":
        msg = (
            "Select at least two genes to compute hierarchical clustering of samples with "
            "correlation distance metric or use Euclidean distance metric.")
        set_error(msg)

    expressions, excluded = get_expressions(fnames=args.sample_files,
                                            gene_set=args.gene_labels)

    if len(expressions.index) == 0:
        if not args.gene_labels:
            msg = "The selected samples do not have any common genes."
        else:
            msg = "None of the selected genes are present in all samples."
        set_error(msg)

    if len(expressions.index) == 1 and args.distance_metric != "euclidean":
        if not args.gene_labels:
            msg = (
                "The selected samples contain only one common gene ({}). At least two common "
                "genes are required to compute hierarchical clustering of samples with "
                "correlation distance metric. Select a different set of samples or use Euclidean "
                "distance metric.".format(
                    get_gene_names(list(expressions.index), args.source,
                                   args.species)[0]))
        else:
            msg = (
                "Only one of the selected genes ({}) is present in all samples but at least two "
                "such genes are required to compute hierarchical clustering of samples with "
                "correlation distance metric. Select more genes or use Euclidean distance "
                "metric.".format(
                    get_gene_names(list(expressions.index), args.source,
                                   args.species)[0]))
        set_error(msg)

    expressions = transform(expressions, log2=args.log2, z_score=args.z_score)

    if args.remove_const:
        # ``matches[i]`` is truthy for samples kept after the removal.
        expressions, matches = remove_const_samples(expressions)

        if len(expressions.columns) == 0:
            msg = (
                "All of the selected samples have constant expression across genes. Hierarchical "
                "clustering of samples cannot be computed.")
            set_error(msg)

        if len(expressions.columns) == 1:
            sample_name = [
                name for i, name in enumerate(args.sample_names) if matches[i]
            ][0]
            msg = (
                "Only one of the selected samples ({}) has a non-constant expression across "
                "genes. However, hierarchical clustering of samples cannot be computed with "
                "just one sample.".format(sample_name))
            set_error(msg)

        removed = [
            name for i, name in enumerate(args.sample_names) if not matches[i]
        ]
        suffix = "" if len(removed) <= 3 else ", ..."
        if removed:
            msg = (
                "{} of the selected samples ({}) have constant expression across genes. "
                "Those samples are excluded from the computation of hierarchical clustering of "
                "samples with correlation distance "
                "metric.".format(len(removed),
                                 ", ".join(removed[:3]) + suffix))
            send_message(warning(msg))
    else:
        matches = [True] * len(args.sample_files)

    # Only the first three excluded genes are named; the suffix marks a
    # truncated list so the message does not look complete.
    suffix = "" if len(excluded) <= 3 else ", ..."
    if excluded:
        excluded_names = get_gene_names(excluded[:3], args.source,
                                        args.species)

        if len(excluded) == 1:
            if not args.gene_labels:
                msg = (
                    "Gene {} is present in some but not all of the selected samples. This "
                    "gene is excluded from the computation of hierarchical clustering of "
                    "samples.".format(", ".join(excluded_names)))
            else:
                msg = (
                    "{} of the selected genes ({}) is missing in at least one of the selected "
                    "samples. This gene is excluded from the computation of hierarchical "
                    "clustering of samples.".format(len(excluded),
                                                    ", ".join(excluded_names)))
            send_message(warning(msg))

        if len(excluded) > 1:
            listed_names = ", ".join(excluded_names) + suffix
            if not args.gene_labels:
                msg = (
                    "{} genes ({}) are present in some but not all of the selected samples. Those "
                    "genes are excluded from the computation of hierarchical clustering of "
                    "samples.".format(len(excluded), listed_names))
            else:
                msg = (
                    "{} of the selected genes ({}) are missing in at least one of the selected "
                    "samples. Those genes are excluded from the computation of hierarchical "
                    "clustering of samples.".format(len(excluded),
                                                    listed_names))
            send_message(warning(msg))

    linkage, dendrogram = get_clustering(
        expressions,
        distance_metric=get_distance_metric(args.distance_metric),
        linkage_method=args.linkage_method,
        order=args.order,
    )

    # Report only the samples that took part in the clustering.
    sample_ids = [
        sample_id for i, sample_id in enumerate(args.sample_ids) if matches[i]
    ]
    result = {
        "sample_ids": {i: {
            "id": sample_id
        }
                       for i, sample_id in enumerate(sample_ids)},
        "linkage": linkage.tolist(),
        "order": dendrogram["leaves"],
    }
    output_json(result, args.output)
def main():
    """Convert a differential expression table to the standard column set.

    Reads the tab-separated raw file, replaces missing values with 1,
    checks that known numeric columns really are numeric, and writes the
    selected columns (gene_id, logfc, fdr, pvalue, fwer, logodds, stat —
    whichever were requested) both as compact JSON and as a gzipped
    tab-separated file.

    A non-numeric column is reported with an error message and raises
    ``ValueError``.
    """
    args = parse_arguments()

    de_data = pd.read_csv(args.raw_file, sep="\t")
    de_data = de_data.rename(columns={"Unnamed: 0": "gene_id"})
    de_data = de_data.fillna(value=1)

    # Union of the numeric column names produced by cuffdiff, edgeR, deseq2
    # and the test files.
    numeric_columns = [
        "baseMean",
        "log2FoldChange",
        "lfcSE",
        "stat",
        "pvalue",
        "padj",
        "value_1",
        "value_2",
        "log2(fold_change)",
        "test_stat",
        "p_value",
        "q_value",
        "logfc",
        "fdr",
        "stat",
        "logFC",
        "logCPM",
        "LR",
        "Pvalue",
        "FDR",
    ]

    available_columns = de_data.columns
    for column in numeric_columns:
        if column in available_columns and not is_numeric_dtype(de_data[column]):
            msg = (f"Column {column} is not numeric. Please make sure "
                   f"that the input file has valid numeric values (i.e. "
                   f"periods for decimal places).")
            send_message(error(msg))
            raise ValueError(msg)

    columns = {}
    col_order = []

    if args.gene_id:
        if args.gene_id == "index":
            identifiers = de_data.index
        else:
            identifiers = de_data[args.gene_id]
        columns["gene_id"] = list(identifiers.astype(str))
        col_order.append("gene_id")

    if args.logfc:
        # Infinite fold changes are written as 0.
        fold_changes = np.array(de_data[args.logfc])
        fold_changes[np.isinf(fold_changes)] = 0
        columns["logfc"] = list(fold_changes)
        col_order.append("logfc")

    # The remaining statistics are copied unchanged, in this fixed order.
    for output_name in ("fdr", "pvalue", "fwer", "logodds", "stat"):
        source_column = getattr(args, output_name)
        if source_column:
            columns[output_name] = list(de_data[source_column])
            col_order.append(output_name)

    with open(args.output_json, "w") as f:
        json.dump(columns, f, separators=(",", ":"), allow_nan=False)

    outdf = pd.DataFrame(columns)[col_order]
    outdf.to_csv(args.output_file, sep="\t", index=False, compression="gzip")