def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    mappability = parse_mapability_file(args.mappability)
    expression = parse_expression_file(args.counts)

    missing_genes = expression.index.difference(mappability.index)
    if len(missing_genes) > 0:
        send_message(
            error("Feature ID {} is not present in the mappability file. "
                  "Make sure that the expressions and mappability file are "
                  "derived from the same annotations (GTF/GFF) file.".format(
                      missing_genes[0])))
        sys.exit(1)

    lib_size = expression.sum()

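    # Counts per billion, scaled by library size and per-gene mappability;
    # genes with zero mappability are zeroed out just below.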
    result = 10**9 * expression / lib_size / mappability
    result[mappability == 0] = 0.0

    result.loc[expression.index].to_csv(
        args.output,
        index_label="Gene",
        header=["Expression"],
        sep="\t",
        compression="gzip",
    )
Example #2
    def test_send_message(self):
        def _receive(server_socket, result):
            response = {'type_data': 'OK'}
            message_body = json.dumps(response).encode()
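            # Frame the reply as a 5-digit, zero-padded length header
            # followed by the JSON body.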
            message_header = "{length:0{size}d}".format(
                length=len(message_body), size=5
            ).encode("utf-8")
            message = message_header + message_body
            connection = server_socket.accept()[0]
            received = b""
            header_length = int(connection.recv(5))
            received = connection.recv(header_length)
            connection.send(message)
            result.append(received)

        result = []
        test_message = "Test data"
        temp_dir = tempfile.mkdtemp()
        try:
            socket_path = os.path.join(temp_dir, "socket.s")
            with patch("resolwe_runtime_utils.COMMUNICATOR_SOCKET", socket_path):
                sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
                sock.bind(socket_path)
                sock.listen(1)
                server_thread = Thread(target=_receive, args=(sock, result))
                server_thread.start()
                send_message(test_message)
                server_thread.join()
        finally:
            shutil.rmtree(temp_dir)

        self.assertEqual(test_message, json.loads(result[0].decode()))
Example #3
def parse_expression_file(exp_file, exp_type):
    """Parse expression file to a Pandas dataframe."""
    with gzip.open(exp_file) as exp:
        df = pd.read_csv(exp, sep="\t")

        ALLOWED_COLUMNS = ["Gene", "Transcript", "Expression"]
        if not all(column_label in ALLOWED_COLUMNS
                   for column_label in df.columns.values):
            send_message(
                error("Invalid column headers {} in file {}.".format(
                    df.columns.values, exp_file)))
            sys.exit(1)

        df.rename(
            index=str,
            columns={
                "Gene": "FEATURE_ID",
                "Transcript": "FEATURE_ID",
                "Expression": exp_type,
            },
            inplace=True,
        )
        # Cast FEATURE_ID column to string
        df["FEATURE_ID"] = df["FEATURE_ID"].astype("str")
        # Remove any possible empty rows from the input file
        df.dropna(inplace=True)

    return df
Example #4
def save_results(matched, notmatched, badquality, skipped, total, _progress):
    total = float(total)
    send_message(
        save(
            "matched",
            "{:,} reads ({:.2f} %)".format(matched, 100 * matched / total),
        ))
    send_message(
        save(
            "notmatched",
            "{:,} reads ({:.2f} %)".format(notmatched, 100 * notmatched / total),
        ))
    send_message(
        save(
            "badquality",
            "{:,} reads ({:.2f} %)".format(badquality, 100 * badquality / total),
        ))
    send_message(
        save(
            "skipped",
            "{:,} reads ({:.2f} %)".format(skipped, 100 * skipped / total),
        ))
    send_message(progress(_progress))
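
# A minimal usage sketch (all counts invented): each call emits four save
# messages and a final progress update through send_message.
save_results(matched=950, notmatched=30, badquality=15,
             skipped=5, total=1000, _progress=0.5)
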
def validate_inputs(args):
    """Validate inputs."""
    # Validate that all expression types are equal.
    exp_type_set = set(args.exp_types)
    if len(exp_type_set) != 1:
        msg = "All samples should have the same expression type, but multiple expression types were given: {}."
        msg = msg.format(", ".join(exp_type_set))
        send_message(warning(msg))

    # Validate that same number of sample names, expression files and
    # expression types are given.
    assert len(args.sample_names) == len(args.sample_exps) == len(args.exp_types)
Example #6
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    amplicon_names = set()

    with open(args.master_file, newline="") as masterfile:
        reader = csv.reader(masterfile, delimiter="\t")
        for row in reader:
            if len(row) != 12:
                send_message(
                    error(
                        "Uploaded master file must contain exactly 12 columns."
                    ))
            if not check_dna_sequence(row[10]):
                send_message(error("11th column must contain a DNA sequence."))
            if not check_dna_sequence(row[11]):
                send_message(error("12th column must contain a DNA sequence."))

            amp_name = row[3]
            if amp_name not in amplicon_names:
                amplicon_names.add(amp_name)
            else:
                send_message(
                    error(
                        "Amplicon names must be unique. Amplicon {} is seen multiple times."
                        .format(amp_name)))
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    validate_inputs(args)

    exp_type = args.exp_types[0]
    spikeins_mix = args.spikeins_mix

    expected = get_expected(spikeins_mix, log2=True)

    min_one_has_spikeins = False  # set to True once a sample has nonzero spike-ins
    warnings = []
    for sample_name, sample_exp in zip(args.sample_names, args.sample_exps):

        measured_zero = get_measured(sample_exp, sample_name, exp_type, only_zero=True)
        measured_nonzero = get_measured(
            sample_exp, sample_name, exp_type, only_nonzero=True, log2=True
        )

        merged_zero = merge_expected_measured(expected, measured_zero)
        merged_nonzero = merge_expected_measured(expected, measured_nonzero)

        # Keep only ERCC spike-ins and plot the histogram-scatter figure.
        if merged_nonzero.iloc[merged_nonzero.index.str.startswith("ERCC"), :].empty:
            warnings.append(
                "All ERCC spike-ins have zero expression in sample {}".format(
                    sample_name
                )
            )
            continue

        min_one_has_spikeins = True
        plot_histogram_scatter(
            expected=expected.iloc[expected.index.str.startswith("ERCC")],
            zero=merged_zero.iloc[merged_zero.index.str.startswith("ERCC"), :],
            nonzero=merged_nonzero.iloc[merged_nonzero.index.str.startswith("ERCC"), :],
            spikein_type="ERCC",
            sample_name=sample_name,
            exp_type=exp_type,
        )

    if min_one_has_spikeins:
        for message in warnings:
            send_message(warning(message))
    else:
        # In case all samples have zero expression for all spikeins,
        # rather print one warning that says so (instead of printing
        # warning for each of the samples).
        send_message(warning("All ERCC spike-ins in all samples have zero expression."))
Example #8
def parse_mappings(species, infile, outfile):
    """Parse file with chromosome mappings."""
    mappings = dict()
    # If the species has no prepared mapping file, emit a warning, return the
    # BigWig file under the output name, and exit with status 0.
    if species not in MAPPINGS_FILES:
        msg = 'Chromosome mappings for species "{}" are not supported.'.format(
            species)
        send_message(warning(msg))
        os.rename(infile, outfile)
        sys.exit(0)

    for basename in MAPPINGS_FILES[species]:
        filename = os.path.join(MAPPINGS_DIR, basename)
        mappings.update(parse_mapping_file(filename))
    return mappings
Example #9
def get_pca(expressions=pd.DataFrame(), n_components=2, gene_labels=None):
    """Compute PCA."""
    if not gene_labels:
        gene_labels = expressions.index
    skipped_gene_labels = list(set(gene_labels).difference(expressions.index))

    if expressions.shape[0] < 2 or expressions.shape[1] < 2:
        coordinates = [[0.0, 0.0] for i in range(expressions.shape[1])]
        all_components = [[], []]
        all_explained_variance_ratios = [0.0, 0.0]
    else:
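        # Samples are columns of the expressions matrix; transposing makes PCA
        # return one row of coordinates per sample rather than per gene.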
        pca = PCA(n_components=n_components, whiten=True)
        pca_expressions = pca.fit_transform(expressions.transpose())

        coordinates = [
            t[:2].tolist() if len(t) > 1 else [t[0], 0.0]
            for t in pca_expressions
        ]
        all_components = [
            component_top_factors(component, gene_labels)
            for component in pca.components_
        ]
        if np.isnan(pca.explained_variance_ratio_).any():
            all_explained_variance_ratios = [0.0 for _ in pca.explained_variance_ratio_]
        else:
            all_explained_variance_ratios = pca.explained_variance_ratio_.tolist()

    result = {
        "coordinates": coordinates,
        "all_components": all_components,
        "all_explained_variance_ratios": all_explained_variance_ratios,
        "skipped_gene_labels": skipped_gene_labels,
        "warning": None,
    }

    if expressions.empty:
        send_message(
            warning(
                "Gene selection and filtering resulted in no genes. Please select different samples or genes."
            ))

    return result
Example #10
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()
    gene_sets = create_gene_sets(args.dge_file, args.logfc, args.fdr)

    fname_prefix = generate_name(args.analysis_name, args.tool, args.logfc,
                                 args.fdr)

    out_dir = Path(args.out_dir)
    if not out_dir.exists():
        out_dir.mkdir()

    for name, data in gene_sets.items():
        if data.empty:
            send_message(
                warning(
                    f"No {name}-regulated genes. Gene set was not created."))
        else:
            save_genes(data, out_dir / f"{fname_prefix}_{name}.tab.gz")
Example #11
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    with open(args.geneset_file, "rU") as infile:
        # skip empty lines in input gene set file
        genes = [str(line.strip()) for line in infile if line.strip()]
        geneset = sorted(set(genes))

        if len(genes) != len(geneset):
            send_message(warning("Removed duplicated genes."))

        with open(args.output_json, "w") as json_out:
            json.dump({"genes": geneset},
                      json_out,
                      separators=(",", ":"),
                      allow_nan=False)

        with gzip.open(args.output_file, "w") as file_out:
            file_out.write("\n".join(geneset).encode("utf-8"))
Example #12
def parse_mapability_file(mapability_file):
    """Parse mapability file to a Pandas Series."""
    try:
        mappability = pd.read_csv(
            mapability_file,
            sep="\t",
            usecols=["gene_id", "coverage"],
            index_col="gene_id",
            dtype={
                "gene_id": str,
                "coverage": float,
            },
            squeeze=True,
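            # Note: read_csv's squeeze argument was removed in pandas 2.0;
            # on newer pandas, drop it and call .squeeze("columns") instead.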
        )
        return mappability.dropna()
    except (ValueError, OSError) as parse_error:
        send_message(
            error("Failed to read mappability file {}. {}".format(
                basename(mapability_file), parse_error)))
        sys.exit(1)
Example #13
def create_new_header(infile, mappings, outfile):
    """Create new header in BigWig, with UCSC chromosome names."""
    with pyBigWig.open(infile) as bw:
        if set(bw.chroms().keys()).issubset(mappings.values()):
            # If chromosome names are already UCSC, just rename input file to output name.
            # Exit with status 0 since this is normal behavior.
            os.rename(infile, outfile)
            sys.exit(0)

        hdr = [(mappings[chrom], length)
               for chrom, length in bw.chroms().items() if chrom in mappings]

        if not hdr:
            msg = "Neither of the chromosomes in the input file has a valid UCSC pair. No mapping will be done."
            send_message(warning(msg))
            os.rename(infile, outfile)
            sys.exit(0)

        seq_num = 0
        with pyBigWig.open(outfile, "w") as bw_output:
            bw_output.addHeader(hdr)
            for chrom, length in bw.chroms().items():
                ints = bw.intervals(chrom, 0, length)
                if ints and chrom in mappings:
                    bw_output.addEntries(
                        [mappings[chrom]] * len(ints),
                        [x[0] for x in ints],
                        ends=[x[1] for x in ints],
                        values=[x[2] for x in ints],
                    )
                elif chrom not in mappings:
                    seq_num += 1
                    print("UCSC chromosome/conting mapping for {} is missing".
                          format(chrom))

        if seq_num > 0:
            send_message(
                warning(
                    "UCSC chromosome/contig mapping for {} sequence(s) is missing. "
                    "These sequences will not be included in the bigWig file."
                    .format(seq_num)))
Example #14
def parse_expression_file(exp_file):
    """Parse expression file to a Pandas Series."""
    try:
        expression = pd.read_csv(
            exp_file,
            sep="\t",
            compression="gzip",
            usecols=["Gene", "Expression"],
            index_col="Gene",
            dtype={
                "Gene": str,
                "Expression": float,
            },
            squeeze=True,
        )
        return expression.dropna()
    except (ValueError, OSError) as parse_error:
        send_message(
            error("Failed to read input file {}. {}".format(
                basename(exp_file), parse_error)))
        sys.exit(1)
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    with open(args.input_file) as infile:
        data = json.load(infile)
        if "expected_format" in data and "compatible_fragment_ratio" in data:
            send_message(save("strandedness", data["expected_format"]))
            send_message(
                save("fragment_ratio",
                     str(round(data["compatible_fragment_ratio"], 2))))
        else:
            send_message(error("Cannot parse library type information file."))
Example #16
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    if args.norm_expressions and args.norm_expressions_type:
        if len(args.norm_expressions) != len(args.norm_expressions_type):
            send_message(
                error(
                    "The number of additional expression files must match the number of specified "
                    "expressions types."))
            sys.exit(1)

    if args.norm_expressions_type:
        exp_types = [args.expressions_type] + args.norm_expressions_type
        if len(exp_types) != len(set(exp_types)):
            send_message(
                error(
                    "The union of the main expression type ({}) and additional normalized expression types {} "
                    "does not contain unique items.".format(
                        args.expressions_type, args.norm_expressions_type)))
            sys.exit(1)

    res = resdk.Resolwe()

    feature_dict = {}
    df = parse_expression_file(args.expressions, args.expressions_type)

    # Get a list of feature IDs
    input_features = df["FEATURE_ID"].tolist()

    # Split feature IDs into chunks with max size of 10000 elements
    features_sublists = [
        input_features[i:i + CHUNK_SIZE]
        for i in range(0, len(input_features), CHUNK_SIZE)
    ]

    # Fetch features from KB and add them to {feature_id: feature_name} mapping dict
    for fsublist in features_sublists:
        features = res.feature.filter(source=args.source_db,
                                      species=args.species,
                                      feature_id__in=fsublist)
        feature_dict.update({f.feature_id: f.name for f in features})

    # Map gene symbols to feature IDs
    df["GENE_SYMBOL"] = df["FEATURE_ID"].map(feature_dict)

    # Check if all of the input feature IDs could be mapped to the gene symbols
    if not all(f_id in feature_dict for f_id in input_features):
        send_message(
            warning(
                "{} feature(s) could not be mapped to the associated feature symbols."
                .format(sum(df.isnull().values.ravel()))))

    # Merge additional expression files with the original data frame
    if args.norm_expressions and args.norm_expressions_type:
        for exp_file, exp_type in zip(args.norm_expressions,
                                      args.norm_expressions_type):
            exp_df = parse_expression_file(exp_file, exp_type)
            df = df.merge(exp_df, on="FEATURE_ID")

    # Reorder the columns in dataframe
    columns = ["FEATURE_ID", "GENE_SYMBOL", args.expressions_type]
    if args.norm_expressions_type:
        columns = columns + args.norm_expressions_type
    df = df[columns]

    # Replace NaN values with empty string
    df.fillna("", inplace=True)

    # Write to file
    df.to_csv(
        args.output_name + ".txt.gz",
        header=True,
        index=False,
        sep="\t",
        compression="gzip",
    )

    # Write to JSON
    df_dict = df.set_index("FEATURE_ID").to_dict(orient="index")
    with open(args.output_name + ".json", "w") as f:
        json.dump({"genes": df_dict}, f, allow_nan=False)
Example #17
            genes[str(x[0])] = x[1:]
    return times, genes


if file_name.endswith((".xls", ".xlsx")):
    times, genes = import_excel(file_name)

else:
    times, genes = import_table(file_name)

etcjson = '{"etc":%s}' % json.dumps({
    "genes": genes,
    "timePoints": times
},
                                    separators=(",", ":"))
send_message(
    save(
        "etc",
        json.dumps({
            "genes": genes,
            "timePoints": times
        },
                   separators=(",", ":"))))
with gzip.GzipFile(
    filename="",
    mode="wb",
    fileobj=open("etc.json.gz", "wb"),
    mtime=0,
) as gzip_file:
    gzip_file.write(etcjson.encode("utf-8"))
import argparse

import pandas as pd
from pandas.errors import EmptyDataError
from resolwe_runtime_utils import error, send_message

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("-f", "--bed_file", help="Bed file.")
args = parser.parse_args()

try:
    df = pd.read_csv(args.bed_file, delimiter="\t", header=None, dtype=str)
except EmptyDataError:
    send_message(
        error(
            f"The input BED file {args.bed_file} is empty. Your analysis might "
            f"have failed to identify regions of interest (peaks, junctions, etc.)."
        )
    )
else:
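    # Column 5 of a BED file is the score; the UCSC spec requires an integer
    # in [0, 1000], so round and clip it.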
    df.iloc[:, 4] = pd.to_numeric(df.iloc[:, 4]).round().astype(int)
    df.iloc[:, 4] = df.iloc[:, 4].clip(upper=1000)

    # If the strand column exists, replace '?' with '.'.
    if len(df.columns) >= 6:
        df.iloc[:, 5] = df.iloc[:, 5].replace("?", ".")

    output_name = "_".join(["corrected", args.bed_file])
    df.to_csv(output_name, sep="\t", index=False, header=False)
import argparse
import os
import sys

import pandas as pd
from resolwe_runtime_utils import send_message, warning

parser = argparse.ArgumentParser(description=__doc__)

parser.add_argument("-bed",
                    "--bed_file",
                    required=True,
                    help="All splice junctions in BED12 format")
parser.add_argument("-sj",
                    "--novel_sj",
                    required=True,
                    help="Table of annotated novel splice junctions")

if __name__ == "__main__":

    args = parser.parse_args()
    bed_file = args.bed_file

    if os.path.getsize(bed_file) == 0:
        send_message(warning("Bed file has no entries."))
        os.rename(bed_file, "novel_sj.bed")
        sys.exit(0)

    bed = pd.read_csv(args.bed_file, delimiter="\t", header=None, dtype=str)
    novel_sj = pd.read_csv(args.novel_sj, delimiter="\t", dtype=str)
    bed_novel_sj = bed[bed[3].isin(novel_sj["name"])]

    bed_novel_sj.to_csv("novel_sj.bed", sep="\t", index=False, header=False)
Example #20
        break

if args.c:
    x_axis = data.iloc[:, 8][::-1]
    y_axis = data.iloc[:, 6] - data.iloc[:, 7]
else:
    x_axis = data.iloc[:, 7][::-1]
    y_axis = data.iloc[:, 6]

n_sup_enh, rows = data[data.isSuper == 1].shape

chr_pos = (data.CHROM.map(str) + ":" + data.START.map(str) + "-" +
           data.STOP.map(str))

if len(x_axis) != len(y_axis):
    send_message(error("Scatter plot error. len(x_axis) != len(y_axis)"))

if len(labels) > 0 and len(labels) != len(x_axis):
    send_message(error("Scatter plot error. len(labels) != len(x_axis)"))

data = {
    "points": {
        "x_axis": list(x_axis),
        "y_axis": list(y_axis),
        "items": labels
    },
    "annotations": [
        {
            "type": "line",
            "x1": 0,
            "y1": float(cutoff),
import argparse
import os

from pysam import VariantFile
from resolwe_runtime_utils import error, send_message, warning

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("vcf_file",
                    help="VCF file (can be compressed using gzip/bgzip).")
parser.add_argument("summary", help="Summary file to append to.")
args = parser.parse_args()

try:
    vcf = VariantFile(args.vcf_file)
except (OSError, ValueError) as error_msg:
    proc_error = "Input VCF file does not exist or could not be correctly opened."
    send_message(error(proc_error))
    raise ValueError(error_msg)

vcf_header = vcf.header
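# Collect simple header records (e.g. the "reference" line naming the FASTA)
# into a key -> value dict.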
header_records = {record.key: record.value for record in vcf_header.records}

with open(args.summary, "a") as out_file:
    try:
        fasta_name = os.path.basename(header_records["reference"])
    except KeyError:
        fasta_name = ""
        send_message(
            warning(
                "Reference sequence (FASTA) name could not be recognized from the VCF header."
            ))
Example #22
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    res = resdk.Resolwe()

    with open(args.feature_ids) as gene_file:
        genes = [gene.strip() for gene in gene_file]

    org_features = res.feature.filter(source=args.source_db,
                                      species=args.species,
                                      feature_id__in=genes)

    if len(org_features) == 0:
        send_message(error("No genes were fetched from the knowledge base."))
        exit(1)

    if args.source_db == args.target_db:
        target_ids = genes
    else:
        mapping_res = res.mapping.filter(
            source_db=args.source_db,
            source_species=args.species,
            target_db=args.target_db,
            target_species=args.species,
            source_id__in=genes,
        )

        if len(mapping_res) == 0:
            send_message(error("Failed to map features."))
            exit(1)

        mappings = {}
        for m in mapping_res:
            if m.source_id in genes:
                if m.source_id not in mappings:
                    mappings[m.source_id] = m.target_id
                else:
                    send_message(
                        warning(
                            "Mapping {} returned multiple times.".format(m)))

        if len(genes) > len(mappings):
            send_message(warning("Not all features could be mapped."))

        target_ids = mappings.values()

    with tempfile.NamedTemporaryFile() as input_genes:
        input_genes.write(" ".join(target_ids).encode("UTF-8"))
        input_genes.flush()
        process = Popen(
            [
                "processor",
                str(args.pval),
                str(args.min_genes),
                args.obo,
                args.gaf,
                input_genes.name,
            ],
            stdout=PIPE,
            stderr=DEVNULL,
        )
        out, err = process.communicate()

        with open("terms.json", "w") as f:
            f.write(out.decode("UTF-8"))
Example #23
def set_error(msg):
    """Print error message and raise ValueError."""
    send_message(error(msg))
    raise ValueError(msg)


def isfloat(value):
    """Check if value is float."""
    try:
        float(value)
        return True
    except ValueError:
        return False


with utils.gzopen(args.input) as f:
    # Split lines by tabs
    # Ignore lines without a number in second column
    # Build a dictionary of gene-expression pairs
    exp = {
        "genes": {
            gene_exp[0]: float(gene_exp[1])
            for gene_exp in (line.split("\t") for line in f)
            if len(gene_exp) == 2 and isfloat(gene_exp[1])
        }
    }
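    # For a two-line input "G1\t1.5\nG2\t0.0", exp becomes
    # {"genes": {"G1": 1.5, "G2": 0.0}}.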

if args.output:
    with open(args.output, "w") as f:
        json.dump(exp, f)
else:
    send_message(save("exp_json", json.dumps(exp, separators=(",", ":"))))
Example #25
def read_multiplexed(reads1_file, reads2_file, barcodes_file, pool_maps,
                     progress_start):
    """Parse multiplexed file."""
    pool_name = reads1_file.split(".")[0]

    def nicename(a):
        return (a.replace("#", "").replace("  ", " ")
                .replace("/", " ").replace(" ", "_"))

    files, f1, f2, fbar = {}, None, None, None
    try:
        barcodes = set(pool_maps.keys())
        print("BARCODES: {}".format(barcodes))

        for barcode in barcodes:
            name = nicename(pool_maps[barcode])
            if reads2_file:
                filename = "{}_{}_{}_mate1.fq.gz".format(
                    pool_name, name, barcode)
                files[barcode] = gzip.open(filename, "wb")

                filename = "{}_{}_{}_mate2.fq.gz".format(
                    pool_name, name, barcode)
                files[barcode + "2"] = gzip.open(filename, "wb")

            else:
                filename = "{}_{}_{}.fq.gz".format(pool_name, name, barcode)
                files[barcode] = gzip.open(filename, "wb")

        if reads2_file:
            files["notmatched"] = gzip.open(
                "Not_Matched_{}_mate1.fq.gz".format(pool_name), "wb")
            files["badquality"] = gzip.open(
                "Bad_Quality_{}_mate1.fq.gz".format(pool_name), "wb")
            files["notmatched2"] = gzip.open(
                "Not_Matched_{}_mate2.fq.gz".format(pool_name), "wb")
            files["badquality2"] = gzip.open(
                "Bad_Quality_{}_mate2.fq.gz".format(pool_name), "wb")
        else:
            files["notmatched"] = gzip.open(
                "Not_Matched_{}.fq.gz".format(pool_name), "wb")
            files["badquality"] = gzip.open(
                "Bad_Quality_{}.fq.gz".format(pool_name), "wb")

        filenames = list(sorted(set(f.name for f in files.values())))

        p = subprocess.Popen(
            "gzip -dc {} | wc -l".format(barcodes_file),
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        numlines, err = p.communicate()

        if err:
            raise Exception(err)

        numlines = int(numlines)
        readid, matched, notmatched, badquality, skipped = 0, 0, 0, 0, 0
        send_message(progress(progress_start))
        _progress = progress_start
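        # Report progress in up to 20 increments between progress_start and 0.9.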
        progress_step = (0.9 - _progress) / 20.0
        progress_span = max(1, numlines // 20)

        def save_results(matched, notmatched, badquality, skipped, total,
                         _progress):
            total = float(total)
            send_message(
                save(
                    "matched",
                    "{:,} reads ({:.2f} %)".format(matched,
                                                   100 * matched / total),
                ))
            send_message(
                save(
                    "notmatched",
                    "{:,} reads ({:.2f} %)".format(notmatched,
                                                   100 * notmatched / total),
                ))
            send_message(
                save(
                    "badquality",
                    "{:,} reads ({:.2f} %)".format(badquality,
                                                   100 * badquality / total),
                ))
            send_message(
                save(
                    "skipped",
                    "{:,} reads ({:.2f} %)".format(skipped,
                                                   100 * skipped / total),
                ))
            send_message(progress(_progress))

        f1 = gzip.GzipFile(reads1_file, "r")
        fbar = gzip.GzipFile(barcodes_file, "r")

        if reads2_file:
            f2 = gzip.GzipFile(reads2_file, "r")

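        # Each qseq record has 11 tab-separated fields: the sequence is the
        # third-to-last field, the quality string the second-to-last, and
        # the chastity filter flag ("1" = passed) the last.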
        while True:
            readid += 1
            r1 = f1.readline()
            if not r1:
                break
            r1 = r1.decode("utf-8").rstrip("\r\n").split("\t")
            if len(r1) != 11:
                print("SKIPPED: error in {} line in r1".format(readid))
                continue
            s1 = r1[-3].replace(".", "N")
            p1 = r1[-1]

            rbar = fbar.readline()
            if not rbar:
                break
            rbar = rbar.decode("utf-8").rstrip("\r\n").split("\t")
            if len(rbar) != 11:
                print("SKIPPED: error in {} line in rbar".format(readid))
                continue
            sbar = rbar[-3].replace(".", "N")[:barcode_length]
            pbar = rbar[-1]

            if reads2_file:
                r2 = f2.readline()
                if not r2:
                    break
                r2 = r2.decode("utf-8").rstrip("\r\n").split("\t")
                if len(r2) != 11:
                    print("SKIPPED: error in {} line in r2".format(readid))
                    continue
                s2 = r2[-3].replace(".", "N")
                p2 = r2[-1]
            else:
                r2 = r1
                p2 = p1

            if r1[:7] == r2[:7] == rbar[:7] and p1 == p2 == pbar:
                idline = "@" + ":".join(r1[:7]) + " " + sbar
                if p1 == "1" and p2 == "1":
                    if sbar in barcodes:
                        files[sbar].write(
                            (idline + "\n" + s1 + "\n" + "+" + "\n" + r1[-2] +
                             "\n").encode("utf-8"))
                        if reads2_file:
                            files[sbar + "2"].write(
                                (idline + "\n" + s2 + "\n" + "+" + "\n" +
                                 r2[-2] + "\n").encode("utf-8"))
                        matched += 1
                    else:
                        files["notmatched"].write(
                            (idline + "\n" + s1 + "\n" + "+" + "\n" + r1[-2] +
                             "\n").encode("utf-8"))
                        if reads2_file:
                            files["notmatched2"].write(
                                (idline + "\n" + s2 + "\n" + "+" + "\n" +
                                 r2[-2] + "\n").encode("utf-8"))
                        notmatched += 1
                else:
                    files["badquality"].write(
                        (idline + "\n" + s1 + "\n" + "+" + "\n" + r1[-2] +
                         "\n").encode("utf-8"))
                    if reads2_file:
                        files["badquality2"].write(
                            (idline + "\n" + s2 + "\n" + "+" + "\n" + r2[-2] +
                             "\n").encode("utf-8"))
                    badquality += 1
            else:
                print("SKIPPED: {}, p1: {}, p2: {}, pbar: {}".format(
                    readid, p1, p2, pbar))
                print("{} ? {} ? {}".format(r1[:7], r2[:7], rbar[:7]))
                skipped += 1

            if readid % progress_span == 0:
                _progress += progress_step
                save_results(matched, notmatched, badquality, skipped, readid,
                             _progress)

        save_results(matched, notmatched, badquality, skipped, readid, 0.9)

    finally:
        if f1:
            f1.close()
        if f2:
            f2.close()
        if fbar:
            fbar.close()

        for f in files.values():
            f.close()

    return filenames
Example #26
            t = line.split("\t")
            barcode, filename = "", ""

            if len(t) == 2:
                barcode, filename = t[0:2]

            if len(t) > 2 and isnum(t[0]):
                barcode, filename = t[1:3]

            barcode, filename = barcode.strip(), filename.strip()

            if barcode and filename:
                pool_maps[barcode] = filename

                if barcode_length > 0 and barcode_length != len(barcode):
                    send_message(
                        error("Barcodes should be of the same length."))
                    exit(1)
                else:
                    barcode_length = len(barcode)

for bar, _map in pool_maps.items():
    print("{}: {}".format(bar, _map))


def read_multiplexed(reads1_file, reads2_file, barcodes_file, pool_maps,
                     progress_start):
    """Parse multiplexed file."""
    pool_name = reads1_file.split(".")[0]

    def nicename(a):
        return a.replace("#", "").replace("  ",
Example #27
#!/usr/bin/env python3
"""Check if sample names are unique."""
import argparse

from resolwe_runtime_utils import error, send_message

parser = argparse.ArgumentParser(
    description="Check if sample names are unique")
parser.add_argument("samples", help="All samples")
args = parser.parse_args()

samples = args.samples.split(",")

if len(samples) > len(set(samples)):
    send_message(error("Sample names must be unique."))
Example #28
def main():
    """Compute sample hierarchical clustering."""
    args = parse_args()

    if len(args.sample_files) != len(args.sample_ids):
        msg = "The number of sample files does not match the number of sample IDs."
        set_error(msg)

    if len(args.sample_files) != len(args.sample_names):
        msg = "The number of sample files does not match the number of sample names."
        set_error(msg)

    if len(args.sample_files) < 2:
        msg = (
            "Select at least two samples to compute hierarchical clustering of samples."
        )
        set_error(msg)

    if len(args.gene_labels) == 1 and args.distance_metric != "euclidean":
        msg = (
            "Select at least two genes to compute hierarchical clustering of samples with "
            "correlation distance metric or use Euclidean distance metric.")
        set_error(msg)

    expressions, excluded = get_expressions(fnames=args.sample_files,
                                            gene_set=args.gene_labels)

    if len(expressions.index) == 0:
        if not args.gene_labels:
            msg = "The selected samples do not have any common genes."
        else:
            msg = "None of the selected genes are present in all samples."
        set_error(msg)

    if len(expressions.index) == 1 and args.distance_metric != "euclidean":
        if not args.gene_labels:
            msg = (
                "The selected samples contain only one common gene ({}). At least two common "
                "genes are required to compute hierarchical clustering of samples with "
                "correlation distance metric. Select a different set of samples or use Euclidean "
                "distance metric.".format(
                    get_gene_names(list(expressions.index), args.source,
                                   args.species)[0]))
        else:
            msg = (
                "Only one of the selected genes ({}) is present in all samples but at least two "
                "such genes are required to compute hierarchical clustering of samples with "
                "correlation distance metric. Select more genes or use Euclidean distance "
                "metric.".format(
                    get_gene_names(list(expressions.index), args.source,
                                   args.species)[0]))
        set_error(msg)

    expressions = transform(expressions, log2=args.log2, z_score=args.z_score)

    if args.remove_const:
        expressions, matches = remove_const_samples(expressions)
        if len(expressions.columns) == 0:
            msg = (
                "All of the selected samples have constant expression across genes. Hierarchical "
                "clustering of samples cannot be computed.")
            set_error(msg)
        if len(expressions.columns) == 1:
            sample_name = [
                name for i, name in enumerate(args.sample_names) if matches[i]
            ][0]
            msg = (
                "Only one of the selected samples ({}) has a non-constant expression across "
                "genes. However, hierarchical clustering of samples cannot be computed with "
                "just one sample.".format(sample_name))
            set_error(msg)
        removed = [
            name for i, name in enumerate(args.sample_names) if not matches[i]
        ]
        suffix = "" if len(removed) <= 3 else ", ..."
        if removed:
            msg = (
                "{} of the selected samples ({}) have constant expression across genes. "
                "Those samples are excluded from the computation of hierarchical clustering of "
                "samples with correlation distance "
                "metric.".format(len(removed),
                                 ", ".join(removed[:3]) + suffix))
            send_message(warning(msg))
    else:
        matches = [True] * len(args.sample_files)

    suffix = "" if len(excluded) <= 3 else ", ..."
    if excluded:
        excluded_names = get_gene_names(excluded[:3], args.source,
                                        args.species)
    if len(excluded) == 1:
        if not args.gene_labels:
            msg = (
                "Gene {} is present in some but not all of the selected samples. This "
                "gene is excluded from the computation of hierarchical clustering of "
                "samples.".format(", ".join(excluded_names)))
        else:
            msg = (
                "{} of the selected genes ({}) is missing in at least one of the selected "
                "samples. This gene is excluded from the computation of hierarchical "
                "clustering of samples.".format(len(excluded),
                                                ", ".join(excluded_names)))
        send_message(warning(msg))
    if len(excluded) > 1:
        if not args.gene_labels:
            msg = (
                "{} genes ({}) are present in some but not all of the selected samples. Those "
                "genes are excluded from the computation of hierarchical clustering of "
                "samples.".format(len(excluded), ", ".join(excluded_names) + suffix))
        else:
            msg = (
                "{} of the selected genes ({}) are missing in at least one of the selected "
                "samples. Those genes are excluded from the computation of hierarchical "
                "clustering of samples.".format(len(excluded),
                                                ", ".join(excluded_names) + suffix))
        send_message(warning(msg))

    linkage, dendrogram = get_clustering(
        expressions,
        distance_metric=get_distance_metric(args.distance_metric),
        linkage_method=args.linkage_method,
        order=args.order,
    )

    sample_ids = [
        sample_id for i, sample_id in enumerate(args.sample_ids) if matches[i]
    ]
    result = {
        "sample_ids": {
            i: {"id": sample_id} for i, sample_id in enumerate(sample_ids)
        },
        "linkage": linkage.tolist(),
        "order": dendrogram["leaves"],
    }
    output_json(result, args.output)
Example #29
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    de_data = pd.read_csv(args.raw_file, sep="\t")
    de_data.rename(columns={"Unnamed: 0": "gene_id"}, inplace=True)
    de_data.fillna(value=1, inplace=True)
    columns = {}
    col_order = []

    # Make sure all listed numeric columns are valid numeric variables based
    # on a union of numeric column names from cuffdiff, edgeR, deseq2 and test
    # files.
    numeric_columns = [
        "baseMean",
        "log2FoldChange",
        "lfcSE",
        "stat",
        "pvalue",
        "padj",
        "value_1",
        "value_2",
        "log2(fold_change)",
        "test_stat",
        "p_value",
        "q_value",
        "logfc",
        "fdr",
        "stat",
        "logFC",
        "logCPM",
        "LR",
        "Pvalue",
        "FDR",
    ]
    de_columns = de_data.columns

    for column in numeric_columns:
        if column not in de_columns:
            continue

        if not is_numeric_dtype(de_data[column]):
            msg = (f"Column {column} is not numeric. Please make sure "
                   f"that the input file has valid numeric values (i.e. "
                   f"periods for decimal places).")
            send_message(error(msg))
            raise ValueError(msg)

    if args.gene_id:
        if args.gene_id == "index":
            columns["gene_id"] = list(de_data.index.astype(str))
        else:
            columns["gene_id"] = list(de_data[args.gene_id].astype(str))
        col_order.append("gene_id")

    if args.logfc:
        col = np.array(de_data[args.logfc])
        col[np.isinf(col)] = 0
        columns["logfc"] = list(col)
        col_order.append("logfc")

    if args.fdr:
        columns["fdr"] = list(de_data[args.fdr])
        col_order.append("fdr")

    if args.pvalue:
        columns["pvalue"] = list(de_data[args.pvalue])
        col_order.append("pvalue")

    if args.fwer:
        columns["fwer"] = list(de_data[args.fwer])
        col_order.append("fwer")

    if args.logodds:
        columns["logodds"] = list(de_data[args.logodds])
        col_order.append("logodds")

    if args.stat:
        columns["stat"] = list(de_data[args.stat])
        col_order.append("stat")

    with open(args.output_json, "w") as f:
        json.dump(columns, f, separators=(",", ":"), allow_nan=False)

    outdf = pd.DataFrame(columns)
    outdf = outdf[col_order]
    outdf.to_csv(args.output_file, sep="\t", index=False, compression="gzip")