Example #1
def transform(expressions, log2=False, const=1.0, normalization=None, ddof=1):
    """Compute log2 and normalize expression values.

    Parameters:
    - log2: use log2(x+const) transformation
    - const: an additive constant used in computation of log2
    - normalization: None or 'z-score'
    - ddof: degrees of freedom used in computation of z-scores

    """
    if log2:
        try:
            expressions = expressions.applymap(lambda x: np.log2(x + const))
        except Exception:
            msg = 'Cannot apply log2 to expression values.'
            print(error(msg))
            raise ValueError(msg)
    if normalization:
        if normalization == 'z-score':
            try:
                expressions = expressions.apply(lambda x: zscore(x, ddof=ddof),
                                                axis=0)
            except Exception:
                msg = 'Cannot compute Z-scores.'
                print(error(msg))
                raise ValueError(msg)
        else:
            msg = 'Unknown normalization type {}.'.format(normalization)
            print(error(msg))
            raise ValueError(msg)
    return expressions
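A minimal usage sketch for the transform above, assuming the module-level imports the function relies on (numpy as np, pandas, and scipy.stats.zscore); the data frame here is hypothetical:

import numpy as np
import pandas as pd
from scipy.stats import zscore  # transform() expects this name in scope

# Hypothetical 3-gene x 2-sample expression matrix.
expressions = pd.DataFrame(
    {'sample_1': [0.0, 7.0, 15.0], 'sample_2': [1.0, 3.0, 31.0]},
    index=['gene_a', 'gene_b', 'gene_c'],
)

# log2(x + 1) followed by per-column Z-scores (ddof=1 by default).
normalized = transform(expressions, log2=True, normalization='z-score')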
Example #2
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    amplicon_names = set()

    with open(args.master_file, newline="") as masterfile:
        reader = csv.reader(masterfile, delimiter="\t")
        for row in reader:
            if len(row) != 12:
                print(
                    error(
                        "Uploaded master file must contain exactly 12 columns."
                    ))
            if not check_dna_sequence(row[10]):
                print(error("11th column must contain a DNA sequence."))
            if not check_dna_sequence(row[11]):
                print(error("12th column must contain a DNA sequence."))

            amp_name = row[3]
            if amp_name not in amplicon_names:
                amplicon_names.add(amp_name)
            else:
                print(
                    error(
                        "Amplicon names must be unique. Amplicon {} is seen multiple times."
                        .format(amp_name)))
Example #3
    def start(self, inputs):
        """Start the process.

        :param inputs: An instance of `Inputs` describing the process inputs
        :return: An instance of `Outputs` describing the process outputs
        """
        self.logger.info("Process is starting")

        outputs = Outputs(self._meta.outputs)

        self.logger.info("Process is running")
        try:
            self.run(inputs.freeze(), outputs)
            return outputs.freeze()
        except Exception as error:
            self.logger.exception("Exception while running process")
            print(resolwe_runtime_utils.error(str(error)))
            raise
        except:  # noqa
            self.logger.exception("Exception while running process")
            print(
                resolwe_runtime_utils.error("Exception while running process"))
            raise
        finally:
            self.logger.info("Process has finished")
Example #4
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    res = resdk.Resolwe()

    with open(args.feature_ids) as gene_file:
        genes = [gene.strip() for gene in gene_file]

    org_features = res.feature.filter(source=args.source_db, species=args.species, feature_id=genes)

    if len(org_features) == 0:
        print(error("No genes were fetched from the knowledge base."))
        exit(1)

    if args.source_db == args.target_db:
        target_ids = genes
    else:
        mapping_res = res.mapping.filter(
            source_db=args.source_db,
            source_species=args.species,
            target_db=args.target_db,
            target_species=args.species,
            source_id=genes,
        )

        if len(mapping_res) == 0:
            print(error("Failed to map features."))
            exit(1)

        mappings = {}
        for m in mapping_res:
            if m.source_id in genes:
                if m.source_id not in mappings:
                    mappings[m.source_id] = m.target_id
                else:
                    print(warning("Mapping {} returned multiple times.".format(m)))

        if len(genes) > len(mappings):
            print(warning("Not all features could be mapped."))

        target_ids = mappings.values()

    with tempfile.NamedTemporaryFile() as input_genes:
        input_genes.write(' '.join(target_ids).encode("UTF-8"))
        input_genes.flush()
        process = Popen(['processor', str(args.pval), str(args.min_genes), args.obo, args.gaf, input_genes.name],
                        stdout=PIPE,
                        stderr=DEVNULL
                        )
        out, err = process.communicate()

        with open('terms.json', 'w') as f:
            f.write(out.decode("UTF-8"))
Example #5
def parse_expression_file(exp_file, exp_type):
    """Parse expression file to a Pandas dataframe."""
    with gzip.open(exp_file) as exp:
        df = pd.read_csv(exp, sep='\t')

        ALLOWED_COLUMNS = ["Gene", "Transcript", "Expression"]
        if not all(column_label in ALLOWED_COLUMNS
                   for column_label in df.columns.values):
            print(
                error('Invalid column headers {} in file {}.'.format(
                    df.columns.values, exp_file)))
            sys.exit(1)

        df.rename(index=str,
                  columns={
                      "Gene": "FEATURE_ID",
                      "Transcript": "FEATURE_ID",
                      "Expression": exp_type,
                  },
                  inplace=True)
        # Cast FEATURE_ID column to string
        df['FEATURE_ID'] = df['FEATURE_ID'].astype('str')
        # Remove any possible empty rows from the input file
        df.dropna(inplace=True)

    return df
Example #6
def test_string(self):
    expected = {
        'type': 'COMMAND',
        'type_data': 'process_log',
        'data': {'error': 'Some error'},
    }
    self.assertEqual(error('Some error'), expected)
Example #7
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    mappability = parse_mapability_file(args.mappability)
    expression = parse_expression_file(args.counts)

    missing_genes = expression.index.difference(mappability.index)
    if len(missing_genes) > 0:
        send_message(
            error("Feature ID {} is not present in the mappability file. "
                  "Make sure that the expressions and mappability file are "
                  "derived from the same annotations (GTF/GFF) file.".format(
                      missing_genes[0])))
        sys.exit(1)

    lib_size = expression.sum()

    result = 10**9 * expression / lib_size / mappability
    result[mappability == 0] = 0.0

    result.loc[expression.index].to_csv(
        args.output,
        index_label="Gene",
        header=["Expression"],
        sep="\t",
        compression="gzip",
    )
Example #8
def get_clustering(expressions,
                   distance_metric='euclidean',
                   linkage_method='average',
                   ordering_method=None,
                   n_keep=None,
                   n_trials=1000):
    """Compute linkage, order, and produce a dendrogram."""
    if len(expressions.columns) < 2:
        return np.array([]), {'leaves': list(range(len(expressions.columns)))}
    try:
        distances = pdist(np.transpose(np.array(expressions)),
                          metric=distance_metric)
        if np.isnan(distances).any():
            distances = np.nan_to_num(distances, copy=False)
            print(
                warning(
                    'Distances between some samples were undefined and were set to zero.'
                ))
    except Exception:
        msg = 'Cannot compute distances between samples.'
        print(error(msg))
        raise ValueError(msg)
    try:
        link = linkage(y=distances, method=linkage_method)
    except Exception:
        msg = 'Cannot compute linkage.'
        print(error(msg))
        raise ValueError(msg)
    if ordering_method:
        if ordering_method == 'knn':
            link = knn(link, distances)
        elif ordering_method == 'optimal':
            link = optimal(link, distances, n_keep)
        elif ordering_method == 'sa':
            link = simulated_annealing(link, distances, n_trials)
        else:
            msg = 'Unknown ordering method {}'.format(ordering_method)
            print(error(msg))
            raise ValueError(msg)
    try:
        dend = dendrogram(link, no_plot=True)
    except Exception:
        msg = 'Cannot compute dendrogram.'
        print(error(msg))
        raise ValueError(msg)
    return link, dend
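A hedged usage sketch for get_clustering, assuming the SciPy helpers the function calls (pdist, linkage, dendrogram) plus numpy are imported at module level; the expression matrix is hypothetical:

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import dendrogram, linkage

# Hypothetical matrix: rows are genes, columns are samples.
expressions = pd.DataFrame(
    np.random.default_rng(0).random((100, 6)),
    columns=['sample_{}'.format(i) for i in range(6)],
)

# The default ordering_method=None path needs no knn/optimal/sa helpers.
link, dend = get_clustering(expressions, distance_metric='correlation')
sample_order = dend['leaves']  # column indices in dendrogram leaf order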
Example #9
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    with open(args.input_file) as infile:
        data = json.load(infile)
        if 'expected_format' in data and 'compatible_fragment_ratio' in data:
            print(save('strandedness', data['expected_format']))
            print(save('fragment_ratio', str(round(data['compatible_fragment_ratio'], 2))))
        else:
            print(error("Cannot parse library type information file."))
Example #10
def get_clustering(expressions,
                   distance_metric='euclidean',
                   linkage_method='average',
                   order=False):
    """Compute linkage, order, and produce a dendrogram."""
    try:
        link = linkage(y=expressions.transpose(),
                       method=linkage_method,
                       optimal_ordering=order)
    except Exception:
        msg = 'Cannot compute linkage.'
        print(error(msg))
        raise ValueError(msg)
    try:
        dend = dendrogram(link, no_plot=True)
    except Exception:
        msg = 'Cannot compute dendrogram.'
        print(error(msg))
        raise ValueError(msg)
    return link, dend
Example #11
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    amplicon_names = set()

    with open(args.master_file, newline='') as masterfile:
        reader = csv.reader(masterfile, delimiter='\t')
        for row in reader:
            if len(row) != 12:
                print(error('Uploaded master file must contain exactly 12 columns.'))
            if not check_dna_sequence(row[10]):
                print(error('11th column must contain a DNA sequence.'))
            if not check_dna_sequence(row[11]):
                print(error('12th column must contain a DNA sequence.'))

            amp_name = row[3]
            if amp_name not in amplicon_names:
                amplicon_names.add(amp_name)
            else:
                print(error('Amplicon names must be unique. Amplicon {} is seen multiple times.'.format(amp_name)))
Example #12
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    with open(args.input_file) as infile:
        data = json.load(infile)
        if "expected_format" in data and "compatible_fragment_ratio" in data:
            send_message(save("strandedness", data["expected_format"]))
            send_message(
                save("fragment_ratio",
                     str(round(data["compatible_fragment_ratio"], 2))))
        else:
            send_message(error("Cannot parse library type information file."))
Example #13
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    with open(args.input_file) as infile:
        data = json.load(infile)
        if 'expected_format' in data and 'compatible_fragment_ratio' in data:
            print(save('strandedness', data['expected_format']))
            print(
                save('fragment_ratio',
                     str(round(data['compatible_fragment_ratio'], 2))))
        else:
            print(error("Cannot parse library type information file."))
Example #14
def parse_mapability_file(mapability_file):
    """Parse mappability file to a Pandas Series."""
    try:
        mappability = pd.read_csv(
            mapability_file,
            sep='\t',
            usecols=['gene_id', 'coverage'],
            index_col='gene_id',
            dtype={
                'gene_id': str,
                'coverage': float,
            },
            squeeze=True,
        )
        return mappability.dropna()
    except (ValueError, OSError) as parse_error:
        print(
            error("Failed to read mappability file {}. {}".format(
                basename(mapability_file), parse_error)))
        sys.exit(1)
Example #15
def parse_mapability_file(mapability_file):
    """Parse mapability file to a Pandas Series."""
    try:
        mappability = pd.read_csv(
            mapability_file,
            sep="\t",
            usecols=["gene_id", "coverage"],
            index_col="gene_id",
            dtype={
                "gene_id": str,
                "coverage": float,
            },
            squeeze=True,
        )
        return mappability.dropna()
    except (ValueError, OSError) as parse_error:
        send_message(
            error("Failed to read mappability file {}. {}".format(
                basename(mapability_file), parse_error)))
        sys.exit(1)
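Note for newer pandas: the squeeze=True keyword used in the read_csv calls above was deprecated in pandas 1.4 and removed in pandas 2.0. A sketch of the equivalent call on current pandas squeezes the one-column data frame after reading:

mappability = pd.read_csv(
    mapability_file,
    sep="\t",
    usecols=["gene_id", "coverage"],
    index_col="gene_id",
    dtype={"gene_id": str, "coverage": float},
).squeeze("columns")  # replaces the removed squeeze=True keyword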
Example #16
def parse_expression_file(exp_file):
    """Parse expression file to a Pandas Series."""
    try:
        expression = pd.read_csv(
            exp_file,
            sep="\t",
            compression="gzip",
            usecols=["Gene", "Expression"],
            index_col="Gene",
            dtype={
                "Gene": str,
                "Expression": float,
            },
            squeeze=True,
        )
        return expression.dropna()
    except (ValueError, OSError) as parse_error:
        send_message(
            error("Failed to read input file {}. {}".format(
                basename(exp_file), parse_error)))
        sys.exit(1)
Example #17
def parse_expression_file(exp_file, exp_type):
    """Parse expression file to a Pandas dataframe."""
    with gzip.open(exp_file) as exp:
        df = pd.read_csv(exp, sep='\t')

        ALLOWED_COLUMNS = ["Gene", "Transcript", "Expression"]
        if not all(column_label in ALLOWED_COLUMNS for column_label in df.columns.values):
            print(error('Invalid column headers {} in file {}.'.format(df.columns.values, exp_file)))
            sys.exit(1)

        df.rename(index=str, columns={
            "Gene": "FEATURE_ID",
            "Transcript": "FEATURE_ID",
            "Expression": exp_type,
        }, inplace=True)
        # Cast FEATURE_ID column to string
        df['FEATURE_ID'] = df['FEATURE_ID'].astype('str')
        # Remove any possible empty rows from the input file
        df.dropna(inplace=True)

    return df
Example #18
def parse_expression_file(exp_file):
    """Parse expression file to a Pandas Series."""
    try:
        expression = pd.read_csv(
            exp_file,
            sep='\t',
            compression='gzip',
            usecols=['Gene', 'Expression'],
            index_col='Gene',
            dtype={
                'Gene': str,
                'Expression': float,
            },
            squeeze=True,
        )
        return expression.dropna()
    except (ValueError, OSError) as parse_error:
        print(
            error("Failed to read input file {}. {}".format(
                basename(exp_file), parse_error)))
        sys.exit(1)
Example #19
def transform(expressions, log2=False, const=1.0, z_score=False, ddof=1):
    """Compute log2 and normalize expression values.

    Parameters:
    - log2: use log2(x+const) transformation
    - const: an additive constant used in computation of log2
    - z_score: use Z-score normalization
    - ddof: degrees of freedom used in computation of Z-score

    """
    if log2:
        expressions = expressions.applymap(lambda x: np.log2(x + const))
        if expressions.isnull().values.any():
            msg = 'Cannot apply log2 to expression values.'
            print(error(msg))
            raise ValueError(msg)
    if z_score:
        expressions = expressions.apply(lambda x: zscore(x, ddof=ddof),
                                        axis=1,
                                        result_type='broadcast')
        expressions.fillna(value=0.0, inplace=True)
    return expressions
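A short usage sketch for this variant, which normalizes per gene (axis=1); the imports mirror what the function body assumes, and the matrix is hypothetical:

import numpy as np
import pandas as pd
from scipy.stats import zscore  # used inside transform()

# Hypothetical matrix: rows are genes, columns are samples.
expressions = pd.DataFrame(
    {'s1': [1.0, 8.0], 's2': [3.0, 8.0], 's3': [7.0, 8.0]},
    index=['gene_a', 'gene_b'],
)

# Row-wise Z-scores of log2(x + 1); a constant row such as gene_b yields
# NaN from zscore and is reset to 0.0 by the fillna call.
out = transform(expressions, log2=True, z_score=True)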
Example #20
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    res = resdk.Resolwe()

    with open(args.feature_ids) as gene_file:
        genes = [gene.strip() for gene in gene_file]

    org_features = res.feature.filter(source=args.source_db,
                                      species=args.species,
                                      feature_id=genes)

    if len(org_features) == 0:
        print(error("No genes were fetched from the knowledge base."))
        exit(1)

    if args.source_db == args.target_db:
        target_ids = genes
    else:
        mapping_res = res.mapping.filter(
            source_db=args.source_db,
            source_species=args.species,
            target_db=args.target_db,
            target_species=args.species,
            source_id=genes,
        )

        if len(mapping_res) == 0:
            print(error("Failed to map features."))
            exit(1)

        mappings = {}
        for m in mapping_res:
            if m.source_id in genes:
                if m.source_id not in mappings:
                    mappings[m.source_id] = m.target_id
                else:
                    print(
                        warning(
                            "Mapping {} returned multiple times.".format(m)))

        if len(genes) > len(mappings):
            print(warning("Not all features could be mapped."))

        target_ids = mappings.values()

    with tempfile.NamedTemporaryFile() as input_genes:
        input_genes.write(' '.join(target_ids).encode("UTF-8"))
        input_genes.flush()
        process = Popen([
            'processor',
            str(args.pval),
            str(args.min_genes), args.obo, args.gaf, input_genes.name
        ],
                        stdout=PIPE,
                        stderr=DEVNULL)
        out, err = process.communicate()

        with open('terms.json', 'w') as f:
            f.write(out.decode("UTF-8"))
Example #21
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    if args.norm_expressions and args.norm_expressions_type:
        if len(args.norm_expressions) != len(args.norm_expressions_type):
            print(error('The number of additional expression files must match the number of specified '
                    'expression types.'))
            sys.exit(1)

    if args.norm_expressions_type:
        exp_types = [args.expressions_type] + args.norm_expressions_type
        if len(exp_types) != len(set(exp_types)):
            print(error('The union of the main expression type ({}) and additional normalized expression types {} '
                        'does not contain unique items.'.format(args.expressions_type, args.norm_expressions_type)))
            sys.exit(1)

    res = resdk.Resolwe()

    feature_dict = {}
    df = parse_expression_file(args.expressions, args.expressions_type)

    # Get a list of feature IDs
    input_features = df['FEATURE_ID'].tolist()

    # Split feature IDs into chunks with max size of 10000 elements
    features_sublists = [input_features[i:i + CHUNK_SIZE] for i in range(0, len(input_features), CHUNK_SIZE)]

    # Fetch features from KB and add them to {feature_id: feature_name} mapping dict
    for fsublist in features_sublists:
        features = res.feature.filter(source=args.source_db, species=args.species, feature_id=fsublist)
        feature_dict.update({f.feature_id: f.name for f in features})

    # Map gene symbols to feature IDs
    df['GENE_SYMBOL'] = df['FEATURE_ID'].map(feature_dict)

    # Check if all of the input feature IDs could be mapped to the gene symbols
    if not all(f_id in feature_dict for f_id in input_features):
        print(warning('{} feature(s) could not be mapped to the associated feature symbols.'.format(
            sum(df.isnull().values.ravel())))
        )

    # Merge additional expression files with the original data frame
    if args.norm_expressions and args.norm_expressions_type:
        for exp_file, exp_type in zip(args.norm_expressions, args.norm_expressions_type):
            exp_df = parse_expression_file(exp_file, exp_type)
            df = df.merge(exp_df, on='FEATURE_ID')

    # Reorder the columns in dataframe
    columns = ['FEATURE_ID', 'GENE_SYMBOL', args.expressions_type]
    if args.norm_expressions_type:
        columns = columns + args.norm_expressions_type
    df = df[columns]

    # Replace NaN values with empty string
    df.fillna('', inplace=True)

    # Write to file
    df.to_csv(args.output_name + '.txt.gz', header=True, index=False, sep='\t', compression='gzip')

    # Write to JSON
    df_dict = df.set_index('FEATURE_ID').to_dict(orient='index')
    with open(args.output_name + '.json', 'w') as f:
        json.dump({'genes': df_dict}, f, allow_nan=False)
Example #22
                    help='clustering linkage function')
parser.add_argument('--filter',
                    help="Filter genes with low expression",
                    action="store_true")

args = parser.parse_args()

distance_map = {
    'spearman': lambda x, y: 1 - spearmanr(x, y).correlation,
    'pearson': lambda x, y: 1 - np.corrcoef(x, y)[0][1],
    'euclidean': 'euclidean'
}

if args.dstfunc not in distance_map:
    msg = "Invalid distance function {}".format(args.dstfunc)
    print(error(msg))
    raise ValueError(msg)

if args.linkage not in ['average', 'single', 'complete']:
    msg = "Invalid clustering linkage function {}".format(args.linkage)
    print(error(msg))
    raise ValueError(msg)

if not args.sampleids or len(args.sampleids) != len(args.sample_files):
    msg = "Number of sample ids must match the number of files"
    print(error(msg))
    raise ValueError(msg)

# read data
matrix = []
gene_subset = set(args.genes) if args.genes else None
Example #23
def test_string(self):
    self.assertEqual(error('Some error'), '{"proc.error": "Some error"}')
Example #24
def set_error(msg):
    """Print error message and raise ValueError."""
    print(error(msg))
    raise ValueError(msg)
Example #25
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    if args.norm_expressions and args.norm_expressions_type:
        if len(args.norm_expressions) != len(args.norm_expressions_type):
            print(
                error(
                    'The number of additional expression files must match the number of specified '
                    'expression types.'))
            sys.exit(1)

    if args.norm_expressions_type:
        exp_types = [args.expressions_type] + args.norm_expressions_type
        if len(exp_types) != len(set(exp_types)):
            print(
                error(
                    'The union of the main expression type ({}) and additional normalized expression types {} '
                    'does not contain unique items.'.format(
                        args.expressions_type, args.norm_expressions_type)))
            sys.exit(1)

    res = resdk.Resolwe()

    feature_dict = {}
    df = parse_expression_file(args.expressions, args.expressions_type)

    # Get a list of feature IDs
    input_features = df['FEATURE_ID'].tolist()

    # Split feature IDs into chunks with max size of 10000 elements
    features_sublists = [
        input_features[i:i + CHUNK_SIZE]
        for i in range(0, len(input_features), CHUNK_SIZE)
    ]

    # Fetch features from KB and add them to {feature_id: feature_name} mapping dict
    for fsublist in features_sublists:
        features = res.feature.filter(source=args.source_db,
                                      species=args.species,
                                      feature_id=fsublist)
        feature_dict.update({f.feature_id: f.name for f in features})

    # Map gene symbols to feature IDs
    df['GENE_SYMBOL'] = df['FEATURE_ID'].map(feature_dict)

    # Check if all of the input feature IDs could be mapped to the gene symbols
    if not all(f_id in feature_dict for f_id in input_features):
        print(
            warning(
                '{} feature(s) could not be mapped to the associated feature symbols.'
                .format(sum(df.isnull().values.ravel()))))

    # Merge additional expression files with the original data frame
    if args.norm_expressions and args.norm_expressions_type:
        for exp_file, exp_type in zip(args.norm_expressions,
                                      args.norm_expressions_type):
            exp_df = parse_expression_file(exp_file, exp_type)
            df = df.merge(exp_df, on='FEATURE_ID')

    # Reorder the columns in dataframe
    columns = ['FEATURE_ID', 'GENE_SYMBOL', args.expressions_type]
    if args.norm_expressions_type:
        columns = columns + args.norm_expressions_type
    df = df[columns]

    # Replace NaN values with empty string
    df.fillna('', inplace=True)

    # Write to file
    df.to_csv(args.output_name + '.txt.gz',
              header=True,
              index=False,
              sep='\t',
              compression='gzip')

    # Write to JSON
    df_dict = df.set_index('FEATURE_ID').to_dict(orient='index')
    with open(args.output_name + '.json', 'w') as f:
        json.dump({'genes': df_dict}, f, allow_nan=False)
Example #26
import argparse
import os

from pysam import VariantFile
from resolwe_runtime_utils import error, warning

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('vcf_file',
                    help="VCF file (can be compressed using gzip/bgzip).")
parser.add_argument('summary', help="Summary file to append to.")
args = parser.parse_args()

try:
    vcf = VariantFile(args.vcf_file)
except (OSError, ValueError) as error_msg:
    proc_error = 'Input VCF file does not exist or could not be correctly opened.'
    print(error(proc_error))
    raise ValueError(error_msg)

vcf_header = vcf.header
header_records = {record.key: record.value for record in vcf_header.records}

with open(args.summary, "a") as out_file:
    try:
        fasta_name = os.path.basename(header_records['reference'])
    except KeyError:
        fasta_name = ''
        print(
            warning(
                'Reference sequence (FASTA) name could not be recognized from the VCF header.'
            ))
"""

import argparse

import pandas as pd
from pandas.errors import EmptyDataError
from resolwe_runtime_utils import error

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("-f", "--bed_file", help="Bed file.")
args = parser.parse_args()

try:
    df = pd.read_csv(args.bed_file, delimiter="\t", header=None, dtype=str)
except EmptyDataError:
    print(
        error(
            f"The input BED file {args.bed_file} is empty. Your analysis might "
            f"have failed to identify regions of interest (peaks, junctions, etc.)."
        ))
else:
    df.iloc[:, 4] = pd.to_numeric(df.iloc[:, 4]).round().astype(int)
    df.iloc[:, 4] = df.iloc[:, 4].clip(upper=1000)

    # if strand column exist replace '?' with '.'
    if len(df.columns) >= 6:
        df.iloc[:, 5] = df.iloc[:, 5].replace("?", ".")

    output_name = "_".join(["corrected", args.bed_file])
    df.to_csv(output_name, sep="\t", index=False, header=False)
Example #28
def error(self, *args):
    """Log error message."""
    report = resolwe_runtime_utils.error(' '.join([str(x) for x in args]))
    # TODO: Use the protocol to report progress.
    print(report)
Example #29
#!/usr/bin/env python3
"""Check if sample names are unique."""
import argparse

from resolwe_runtime_utils import error, send_message

parser = argparse.ArgumentParser(
    description="Check if sample names are unique")
parser.add_argument("samples", help="All samples")
args = parser.parse_args()

samples = args.samples.split(",")

if len(samples) > len(set(samples)):
    send_message(error("Sample names must be unique."))
Example #30
import argparse
import os

from pysam import VariantFile
from resolwe_runtime_utils import error, send_message, warning

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("vcf_file",
                    help="VCF file (can be compressed using gzip/bgzip).")
parser.add_argument("summary", help="Summary file to append to.")
args = parser.parse_args()

try:
    vcf = VariantFile(args.vcf_file)
except (OSError, ValueError) as error_msg:
    proc_error = "Input VCF file does not exist or could not be correctly opened."
    send_message(error(proc_error))
    raise ValueError(error_msg)

vcf_header = vcf.header
header_records = {record.key: record.value for record in vcf_header.records}

with open(args.summary, "a") as out_file:
    try:
        fasta_name = os.path.basename(header_records["reference"])
    except KeyError:
        fasta_name = ""
        send_message(
            warning(
                "Reference sequence (FASTA) name could not be recognized from the VCF header."
            ))
Example #31
def test_string(self):
    self.assertEqual(error('Some error'), '{"proc.error": "Some error"}')
Example #32
        break

if args.c:
    x_axis = data.iloc[:, 8][::-1]
    y_axis = data.iloc[:, 6] - data.iloc[:, 7]
else:
    x_axis = data.iloc[:, 7][::-1]
    y_axis = data.iloc[:, 6]

n_sup_enh, rows = data[data.isSuper == 1].shape

chr_pos = data.CHROM.map(str) + ":" + data.START.map(
    str) + "-" + data.STOP.map(str)

if len(x_axis) != len(y_axis):
    send_message(error("Scatter plot error. len(x_axis) != len(y_axis)"))

if len(labels) > 0 and len(labels) != len(x_axis):
    send_message(error("Scatter plot error. len(labels) != len(x_axis)"))

data = {
    "points": {
        "x_axis": list(x_axis),
        "y_axis": list(y_axis),
        "items": labels
    },
    "annotations": [
        {
            "type": "line",
            "x1": 0,
            "y1": float(cutoff),
Example #33
def main():
    """Invoke when run directly as a program."""
    args = parse_arguments()

    de_data = pd.read_csv(args.raw_file, sep="\t")
    de_data.rename(columns={"Unnamed: 0": "gene_id"}, inplace=True)
    de_data.fillna(value=1, inplace=True)
    columns = {}
    col_order = []

    # Make sure all listed numeric columns are valid numeric variables based
    # on a union of numeric column names from cuffdiff, edgeR, deseq2 and test
    # files.
    numeric_columns = [
        "baseMean",
        "log2FoldChange",
        "lfcSE",
        "stat",
        "pvalue",
        "padj",
        "value_1",
        "value_2",
        "log2(fold_change)",
        "test_stat",
        "p_value",
        "q_value",
        "logfc",
        "fdr",
        "stat",
        "logFC",
        "logCPM",
        "LR",
        "Pvalue",
        "FDR",
    ]
    de_columns = de_data.columns

    for column in numeric_columns:
        if column not in de_columns:
            continue

        if not is_numeric_dtype(de_data[column]):
            msg = (
                f"Column {column} is not numeric. Please make sure "
                f"that the input file has valid numeric values (i.e. "
                f"periods for decimal places)."
            )
            print(error(msg))
            raise ValueError(msg)

    if args.gene_id:
        if args.gene_id == "index":
            columns["gene_id"] = list(de_data.index.astype(str))
            col_order.append("gene_id")
        else:
            columns["gene_id"] = list(de_data[args.gene_id].astype(str))
            col_order.append("gene_id")

    if args.logfc:
        col = np.array(de_data[args.logfc])
        col[np.isinf(col)] = 0
        columns["logfc"] = list(col)
        col_order.append("logfc")

    if args.fdr:
        columns["fdr"] = list(de_data[args.fdr])
        col_order.append("fdr")

    if args.pvalue:
        columns["pvalue"] = list(de_data[args.pvalue])
        col_order.append("pvalue")

    if args.fwer:
        columns["fwer"] = list(de_data[args.fwer])
        col_order.append("fwer")

    if args.logodds:
        columns["logodds"] = list(de_data[args.logodds])
        col_order.append("logodds")

    if args.stat:
        columns["stat"] = list(de_data[args.stat])
        col_order.append("stat")

    with open(args.output_json, "w") as f:
        json.dump(columns, f, separators=(",", ":"), allow_nan=False)

    outdf = pd.DataFrame(columns)
    outdf = outdf[col_order]
    outdf.to_csv(args.output_file, sep="\t", index=False, compression="gzip")
Example #34
def error(self, *args):
    """Log error message."""
    report = resolwe_runtime_utils.error(' '.join([str(x) for x in args]))
    # TODO: Use the protocol to report progress.
    print(report)
Example #35
            barcode, filename = "", ""

            if len(t) == 2:
                barcode, filename = t[0:2]

            if len(t) > 2 and isnum(t[0]):
                barcode, filename = t[1:3]

            barcode, filename = barcode.strip(), filename.strip()

            if barcode and filename:
                pool_maps[barcode] = filename

                if barcode_length > 0 and barcode_length != len(barcode):
                    send_message(
                        error("Barcodes should be of the same length."))
                    exit(1)
                else:
                    barcode_length = len(barcode)

for bar, _map in iteritems(pool_maps):
    print("{}: {}".format(bar, _map))


def read_multiplexed(reads1_file, reads2_file, barcodes_file, pool_maps,
                     progress_start):
    """Parse multiplexed file."""
    pool_name = reads1_file.split(".")[0]

    def nicename(a):
        return a.replace("#", "").replace("  ",
Example #36
def set_error(msg):
    """Print error message and raise ValueError."""
    send_message(error(msg))
    raise ValueError(msg)
Example #37
parser.add_argument('-g', '--genes', nargs='+', default=[], help='subset of gene ids')
parser.add_argument('-d', '--dstfunc', default='euclidean', help='distance function')
parser.add_argument('-l', '--linkage', default='average', help='clustering linkage function')
parser.add_argument('--filter', help="Filter genes with low expression", action="store_true")

args = parser.parse_args()

distance_map = {
    'spearman': lambda x, y: 1 - spearmanr(x, y).correlation,
    'pearson': lambda x, y: 1 - np.corrcoef(x, y)[0][1],
    'euclidean': 'euclidean'
}

if args.dstfunc not in distance_map:
    msg = "Invalid distance function {}".format(args.dstfunc)
    print(error(msg))
    raise ValueError(msg)

if args.linkage not in ['average', 'single', 'complete']:
    msg = "Invalid clustering linkage function {}".format(args.linkage)
    print(error(msg))
    raise ValueError(msg)

if not args.sampleids or len(args.sampleids) != len(args.sample_files):
    msg = "Number of sample ids must match the number of files"
    print(error(msg))
    raise ValueError(msg)

# read data
matrix = []
gene_subset = set(args.genes) if args.genes else None
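A sketch of how a distance_map like the one above is typically consumed, assuming the pairwise distances are later computed with scipy.spatial.distance.pdist, which accepts either a built-in metric name or a callable; the sample matrix here is hypothetical:

import numpy as np
from scipy.spatial.distance import pdist
from scipy.stats import spearmanr  # needed by the 'spearman' lambda

# Hypothetical matrix: one row per sample, one column per gene.
matrix = np.random.default_rng(0).random((4, 50))

# pdist dispatches on type: a string selects a built-in metric, while a
# callable is invoked once per pair of rows.
distances = pdist(matrix, metric=distance_map['spearman'])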