Ejemplo n.º 1
0
def main():
    """Write one ``.tab`` file per distinct fifth-column value of the GAF file.

    Each output file lists, one per line, the second-column values of all
    input lines sharing that fifth-column value. For every file an export
    line and a ``run`` line with the ``upload-geneset`` payload are printed,
    followed by a final summary with the number of files.
    """
    args = parse_arguments()

    genes_by_term = defaultdict(list)

    with gzopen(args.gaf_file) as gaf:
        for record in gaf:
            # Lines starting with '!' are skipped.
            if not record.startswith('!'):
                fields = record.strip().split('\t')
                genes_by_term[fields[4]].append(fields[1])

    for term, term_genes in genes_by_term.items():
        # The file name is the term with every ':' removed.
        outfile = str(term).replace(':', '') + '.tab'
        with open(outfile, "wt") as handle:
            tab_writer = csv.writer(handle, delimiter=str('\t'), lineterminator='\n')
            tab_writer.writerows([gene] for gene in term_genes)

            print(export(outfile))
            payload = {
                'process': 'upload-geneset',
                'input': {
                    'src': outfile,
                    'source': args.source
                }
            }

        print('run {}'.format(json.dumps(payload, separators=(',', ':'))))

    print('{{"num_genesets":{}}}'.format(len(genes_by_term)))
Ejemplo n.º 2
0
def main():
    """Write one ``.tab`` file per distinct fifth-column value of the GAF file.

    Each output file lists, one per line, the second-column values of all
    input lines sharing that fifth-column value. For every file an export
    line and a ``run`` line with the ``upload-geneset`` payload are printed,
    followed by a final summary with the number of files.
    """
    args = parse_arguments()

    genes_by_term = defaultdict(list)

    with gzopen(args.gaf_file) as gaf:
        for record in gaf:
            # Lines starting with '!' are skipped.
            if not record.startswith('!'):
                fields = record.strip().split('\t')
                genes_by_term[fields[4]].append(fields[1])

    for term, term_genes in genes_by_term.items():
        # The file name is the term with every ':' removed.
        outfile = str(term).replace(':', '') + '.tab'
        with open(outfile, "wt") as handle:
            tab_writer = csv.writer(handle, delimiter=str('\t'), lineterminator='\n')
            tab_writer.writerows([gene] for gene in term_genes)

            print(export(outfile))
            payload = {
                'process': 'upload-geneset',
                'input': {
                    'src': outfile,
                    'source': args.source
                }
            }

        print('run {}'.format(json.dumps(payload, separators=(',', ':'))))

    print('{{"num_genesets":{}}}'.format(len(genes_by_term)))
Ejemplo n.º 3
0
def get_measured(sample_exp, sample_name, exp_type, only_zero=False, only_nonzero=False, log2=False):
    """Get measured expression values.

    Reads the tab-delimited file ``sample_exp`` through ``utils.gzopen``,
    indexes it by its ``Gene`` column and returns the ``Expression`` column
    converted to float.

    Parameters:
        sample_exp: file to read; passed to ``utils.gzopen``.
        sample_name: not used by this function.
        exp_type: not used by this function.
        only_zero: keep only values equal to 0.
        only_nonzero: keep only values different from 0.
        log2: apply ``np.log2`` to the (possibly filtered) values.

    Returns:
        The selected expression values, indexed by gene.

    Raises:
        AssertionError: if both ``only_zero`` and ``only_nonzero`` are set.
    """
    # Close the input as soon as it is parsed instead of leaking the handle.
    with utils.gzopen(sample_exp) as handle:
        exp = pd.read_csv(handle, delimiter='\t', index_col='Gene')
    exp = exp.loc[:, 'Expression'].astype('float')

    assert not (only_zero and only_nonzero)
    if only_zero:
        exp = exp[exp == 0]
    elif only_nonzero:
        # Series.nonzero() was removed in pandas 1.0; a boolean mask selects
        # the same entries (NaN counts as nonzero in both forms).
        exp = exp[exp != 0]

    if log2:
        exp = np.log2(exp)

    return exp
Ejemplo n.º 4
0
def get_measured(
    sample_exp, sample_name, exp_type, only_zero=False, only_nonzero=False, log2=False
):
    """Get measured expression values.

    Reads the tab-delimited file ``sample_exp`` through ``utils.gzopen``,
    indexes it by its ``Gene`` column and returns the ``Expression`` column
    converted to float.

    Parameters:
        sample_exp: file to read; passed to ``utils.gzopen``.
        sample_name: not used by this function.
        exp_type: not used by this function.
        only_zero: keep only values equal to 0.
        only_nonzero: keep only values different from 0.
        log2: apply ``np.log2`` to the (possibly filtered) values.

    Returns:
        The selected expression values, indexed by gene.

    Raises:
        AssertionError: if both ``only_zero`` and ``only_nonzero`` are set.
    """
    # Close the input as soon as it is parsed instead of leaking the handle.
    with utils.gzopen(sample_exp) as handle:
        exp = pd.read_csv(handle, delimiter="\t", index_col="Gene")
    exp = exp.loc[:, "Expression"].astype("float")

    assert not (only_zero and only_nonzero)
    if only_zero:
        exp = exp[exp == 0]
    elif only_nonzero:
        # Series.nonzero() was removed in pandas 1.0; a boolean mask selects
        # the same entries (NaN counts as nonzero in both forms).
        exp = exp[exp != 0]

    if log2:
        exp = np.log2(exp)

    return exp
Ejemplo n.º 5
0
args = parser.parse_args()

out_file = open(args.out, "w")

header = ["Gene"]
geneset = []
exp = defaultdict(list)

experiments = iter(args.experiments)

for etc in args.files:
    # Stop with a failure status as soon as one input file is missing.
    if not os.path.isfile(etc):
        exit(1)

    with utils.gzopen(etc) as f:
        etc_data = json.load(f)
        x = next(experiments)

        # One column label per time point: "<experiment> - <time point>h".
        time_points = [str(tp) for tp in etc_data["etc"]["timePoints"]]
        header = header + [x + ' - ' + tp + 'h' for tp in time_points]

        # Gene names present in this file.
        gn = set(etc_data["etc"]["genes"])
        geneset.append(gn)

        for g in gn:
            exp[g].append(etc_data["etc"]["genes"][g])

# Genes that occur in every input file.
genes = set.intersection(*geneset)
Ejemplo n.º 6
0
import csv
import re
import sys

import utils


parser = argparse.ArgumentParser(
    description='Create BEDGRAPH coverage file for a tab file w.r.t. given GFF3 annotations.')
parser.add_argument('--tab', dest='tab_file', help='Tab file')
parser.add_argument('--tab-coverage-col', dest='tab_col_val', help='Tab column with coverage value')
parser.add_argument('--gff3', dest='gff3_file', help='GFF3 file')
args = parser.parse_args()

# Fetch gene ids and their expressions from tab file
with utils.gzopen(args.tab_file) as f:
    rdr = csv.reader(f, delimiter='\t')
    # Skip the header row. The builtin next() works on Python 2.6+ and
    # Python 3, whereas the reader's .next() method exists on Python 2 only.
    next(rdr)
    # The column index is the same for every row, so convert it once.
    coverage_col = int(args.tab_col_val)
    # Maps the first column of each row to the value in the coverage column.
    tab_vals = {row[0]: float(row[coverage_col]) for row in rdr}

genes = {}
# Fetch gene regions and chromosomes they belong to
with open(args.gff3_file, 'r') as f:
    rdr = csv.reader(f, delimiter='\t')
    # Captures the identifier found between "ID=" and the following ";".
    gene_id_regex = re.compile(r'ID=([A-Za-z0-9_]+);')
    for i, row in enumerate(rdr):
        # skip GFF3 headers
        if row[0][0:2] == '##':
            continue
        # skip if not mRNA
        # NOTE(review): as written this condition is always true, because
        # row[2] cannot equal both 'mRNA' and 'transcript' at the same time;
        # `and` (or `not in ('mRNA', 'transcript')`) was presumably intended.
        # The body of this statement is not visible here -- confirm before fixing.
        if row[2] != 'mRNA' or row[2] != 'transcript':
    print('{"rc":"1"}')
    exit(1)

# Report a failure status and stop unless an existing input file was given.
if not args.input or not os.path.isfile(args.input):
    print('{"rc":"1"}')
    exit(1)


def isfloat(value):
    """Return True if ``float(value)`` succeeds, False if it raises ValueError."""
    try:
        float(value)
    except ValueError:
        return False
    else:
        return True


with utils.gzopen(args.input) as f:
    # Split every line on tabs, keep only two-column lines whose second
    # column parses as a number, and map the first column to that number.
    columns = (l.split('\t') for l in f)
    numeric_pairs = (pair for pair in columns if len(pair) == 2 and isfloat(pair[1]))
    exp = {'genes': {pair[0]: float(pair[1]) for pair in numeric_pairs}}

# Without an output path the JSON is printed instead of written to a file.
if not args.output:
    print('{"exp_json":%s}' % json.dumps(exp, separators=(',', ':')))
else:
    with open(args.output, 'w') as f:
        json.dump(exp, f)
Ejemplo n.º 8
0
# Report a failure status and stop unless an existing input file was given.
if not args.input or not os.path.isfile(args.input):
    print('{"rc":"1"}')
    exit(1)


def isfloat(value):
    """Return True if ``float(value)`` succeeds, False if it raises ValueError."""
    try:
        float(value)
    except ValueError:
        return False
    else:
        return True


with utils.gzopen(args.input) as f:
    # Split every line on tabs, keep only two-column lines whose second
    # column parses as a number, and map the first column (passed through
    # utils.escape_mongokey) to that number.
    columns = (l.split('\t') for l in f)
    numeric_pairs = (pair for pair in columns if len(pair) == 2 and isfloat(pair[1]))
    exp = {
        'genes': {
            utils.escape_mongokey(pair[0]): float(pair[1]) for pair in numeric_pairs
        }
    }

if args.output:
    with open(args.output, 'w') as f:
        json.dump(exp, f)
else:
Ejemplo n.º 9
0
# Reject unknown options and mismatched argument counts before any file is read.
if args.dstfunc not in distance_map:
    raise ValueError("Invalid distance function {}".format(args.dstfunc))

if args.linkage not in linkage_map:
    raise ValueError("Invalid clustering linkage function {}".format(
        args.linkage))

if not args.expids or len(args.expids) != len(args.etc_files):
    raise ValueError("Number of experiment ids must match the number of files")

etcs = []
timepoints = set()

# read data
for i, fname in enumerate(args.etc_files):
    # Close each input as soon as it is parsed instead of leaking the handle.
    with utils.gzopen(fname) as etc_handle:
        etcjson = json.load(etc_handle)
    tps = etcjson['etc']['timePoints']
    expid = args.expids[i]

    # Compare neighbouring time points pairwise; this also avoids reusing the
    # outer loop index name inside the check.
    if not all(earlier <= later for earlier, later in zip(tps, tps[1:])):
        raise ValueError("Timepoints should be ordered")

    etc = {'genes': {}, 'experiment': expid, 'timePoints': np.array(tps)}
    timepoints.update(tps)

    # Keep only the requested genes that are present in this file.
    for gene in args.genes:
        if gene in etcjson['etc']['genes']:
            etc['genes'][gene] = np.array(etcjson['etc']['genes'][gene])

    etcs.append(etc)
Ejemplo n.º 10
0
# Main
split = os.path.split
# The ids are the names of the directories that contain the two input files.
ontology_id = split(split(args.ontology)[0])[1]
annotation_id = split(split(args.annotation)[0])[1]
annotation_cache = os.path.join('/tmp', 'GO_' + ontology_id + '_Annotation_' + annotation_id)

if os.path.isfile(annotation_cache):
    try:
        with open(annotation_cache, 'rb') as fd:
            # NOTE(review): the cache sits at a predictable path under /tmp and
            # unpickling can execute arbitrary code -- confirm that this
            # location is only writable by trusted users.
            annotations = pickle.load(fd)
    except Exception:
        # An unreadable cache is removed so that the next run rebuilds it.
        # Interrupts (KeyboardInterrupt/SystemExit) no longer delete the cache.
        os.remove(annotation_cache)
        raise

else:
    with utils.gzopen(args.ontology) as fd:
        ontology = Ontology(fd)

    annotations = Annotations(file=args.annotation, ontology=ontology)

    # -1 selects the highest pickle protocol available.
    with open(annotation_cache, 'wb') as fd:
        pickle.dump(annotations, fd, -1)

translator = {a.DB_Object_Symbol: a.DB_Object_ID for a in annotations}

orth = {}
genes = set()

if args.orthologues:
    # Read the two-column, tab-separated mapping and close the file afterwards
    # instead of leaking the handle.
    with utils.gzopen(args.orthologues) as orth_file:
        orth = dict(l.strip().split("\t") for l in orth_file)
Ejemplo n.º 11
0
# Reject unknown options and mismatched argument counts before any file is read.
if args.dstfunc not in distance_map:
    raise ValueError("Invalid distance function {}".format(args.dstfunc))

if args.linkage not in linkage_map:
    raise ValueError("Invalid clustering linkage function {}".format(args.linkage))

if not args.expids or len(args.expids) != len(args.etc_files):
    raise ValueError("Number of experiment ids must match the number of files")

etcs = []
timepoints = set()

# read data
for i, fname in enumerate(args.etc_files):
    # Close each input as soon as it is parsed instead of leaking the handle.
    with utils.gzopen(fname) as etc_handle:
        etcjson = json.load(etc_handle)
    tps = etcjson['etc']['timePoints']
    expid = args.expids[i]

    # Pairwise comparison replaces the xrange() index loop: xrange does not
    # exist on Python 3, and the inner index reused the outer loop's name.
    if not all(earlier <= later for earlier, later in zip(tps, tps[1:])):
        raise ValueError("Timepoints should be ordered")

    etc = {'genes': {}, 'experiment': expid, 'timePoints': np.array(tps)}
    timepoints.update(tps)

    # Keep only the requested genes that are present in this file.
    for gene in args.genes:
        if gene in etcjson['etc']['genes']:
            etc['genes'][gene] = np.array(etcjson['etc']['genes'][gene])

    etcs.append(etc)
Ejemplo n.º 12
0
import utils


# Exactly one command line argument (the input file) is expected.
# print is written with parentheses so that the statements are valid on
# Python 3 as well as Python 2; the printed text is unchanged.
if len(sys.argv) != 2:
    print('{"rc":"1"}')
    exit(1)

fname = sys.argv[1]

# The argument has to name an existing file.
if not os.path.isfile(fname):
    print('{"rc":"1"}')
    exit(1)


def isfloat(value):
    """Return True if ``float(value)`` succeeds, False if it raises ValueError."""
    try:
        float(value)
    except ValueError:
        return False
    else:
        return True

with utils.gzopen(fname) as f:
    # Split lines by tabs
    # Ignore lines without a number in second column
    # Build a dictionary of gene-expression pairs
    exp = {'genes': {utils.escape_mongokey(gene_exp[0]): float(gene_exp[1]) for
                     gene_exp in (l.split('\t') for l in f) if
                     len(gene_exp) == 2 and isfloat(gene_exp[1])}}

# Parenthesised so that the statement is valid on Python 3 as well as
# Python 2; the printed text is unchanged.
print('{"exp_json":%s}' % json.dumps(exp, separators=(',', ':')))
Ejemplo n.º 13
0
args = parser.parse_args()

out_file = open(args.out, "w")

header = ["Gene"]
geneset = []
exp = defaultdict(list)

experiments = iter(args.experiments)

for etc in args.files:
    # Stop with a failure status as soon as one input file is missing.
    if not os.path.isfile(etc):
        exit(1)

    with utils.gzopen(etc) as f:
        etc_data = json.load(f)
        # The builtin next() works on Python 2.6+ and Python 3, whereas the
        # iterator's .next() method exists on Python 2 only.
        x = next(experiments)
        # One column label per time point: "<experiment> - <time point>h".
        header = header + [x + ' - ' + tp + 'h' for tp in map(str, etc_data["etc"]["timePoints"])]
        # Gene names present in this file.
        gn = set(etc_data["etc"]["genes"])

        geneset.append(gn)

        for g in gn:
            exp[g].append(etc_data["etc"]["genes"][g])

# Genes that occur in every input file.
genes = set.intersection(*geneset)

# Optionally narrow the result to the genes requested on the command line.
if args.genes:
    genes = genes.intersection(args.genes)
Ejemplo n.º 14
0
# pylint: disable=missing-docstring,invalid-name
# XXX: Refactor to a command line tool and remove pylint disable
"""Change genes names to orthologues ones."""
from __future__ import absolute_import, division, print_function

import argparse
import csv

import utils


parser = argparse.ArgumentParser(description='Change genes names to orthologues ones.')

parser.add_argument('ortholog_file', help='file with orthologues')
parser.add_argument('genes', nargs='*', help='genes names')

args = parser.parse_args()

# Map the first column of every row of the tab-separated file to its second
# column; for a repeated first column the last row wins.
with utils.gzopen(args.ortholog_file) as ortholog_tsv:
    orthologues = {row[0]: row[1] for row in csv.reader(ortholog_tsv, delimiter='\t')}

# Swap in the mapped name wherever one exists. The slice assignment updates
# the argument list itself rather than a copy.
genes = args.genes
genes[:] = [orthologues.get(gene, gene) for gene in genes]

print(' '.join(genes))
Ejemplo n.º 15
0

def spearman(x, y):
    """Return the first element (the correlation) of ``spearmanr(x, y)``."""
    correlation, _ = spearmanr(x, y)
    return correlation


# 2nd argument: True if higher value means better score
distance_map = {
    'euclidean': [euclidian, False],
    'pearson': [pearson, True],
    'spearman': [spearman, True]
}

# Validate before the lookup: the check used to come after
# distance_map[args.dstfunc], so an unknown name raised KeyError and this
# ValueError could never be reached.
if args.dstfunc not in distance_map:
    raise ValueError("Invalid distance function {}".format(args.dstfunc))

search_f, rev_sort = distance_map[args.dstfunc]

search_gene = args.gene
# The with statement closes the file even if parsing fails.
with utils.gzopen(args.etc_file) as file_handler:
    expressions = json.load(file_handler)

search_gene_expression = expressions['etc']['genes'][search_gene]

# Score of every other gene against the searched gene.
similarity = [
    {
        'gene': gene,
        'distance': search_f(expressions['etc']['genes'][gene], search_gene_expression),
    }
    for gene in expressions['etc']['genes']
    if gene != search_gene
]
Ejemplo n.º 16
0
    return pearsonr(x, y)[0]


def spearman(x, y):
    """Return the first element (the correlation) of ``spearmanr(x, y)``."""
    correlation, _ = spearmanr(x, y)
    return correlation


# 2nd argument: True if higher value means better score
distance_map = {
    'euclidean': [euclidian, False],
    'pearson': [pearson, True],
    'spearman': [spearman, True]
}

# Validate before the lookup: the check used to come after
# distance_map[args.dstfunc], so an unknown name raised KeyError and this
# ValueError could never be reached.
if args.dstfunc not in distance_map:
    raise ValueError("Invalid distance function {}".format(args.dstfunc))

search_f, rev_sort = distance_map[args.dstfunc]

search_gene = args.gene
# The with statement closes the file even if parsing fails.
with utils.gzopen(args.etc_file) as file_handler:
    expressions = json.load(file_handler)

search_gene_expression = expressions['etc']['genes'][search_gene]

# Score of every other gene against the searched gene.
similarity = [{'gene': gene, 'distance': search_f(expressions['etc']['genes'][gene], search_gene_expression)}
              for gene in expressions['etc']['genes'] if gene != search_gene]


# Drop NaN scores. A list comprehension is used because filter() returns an
# iterator on Python 3, which has no .sort() method.
similarity = [entry for entry in similarity if not math.isnan(entry['distance'])]
similarity.sort(reverse=rev_sort, key=lambda x: x['distance'])
Ejemplo n.º 17
0
parser = argparse.ArgumentParser(description='Median gene expressions of multiple experiments.')
parser.add_argument('files', nargs='*', help='expression files')
parser.add_argument('--name', help='expression column name')
parser.add_argument('--out', help='output file')

args = parser.parse_args()

# Collect, per gene, the expression value found in each input file.
expressions = collections.defaultdict(list)
for f in args.files:
    # Stop with a failure status as soon as one input file is missing.
    if not os.path.isfile(f):
        exit(1)

    # Files ending in .csv are read with ';' as delimiter, all others with tabs.
    base, ext = os.path.splitext(f)
    delimiter = ';' if ext == '.csv' else '\t'

    with utils.gzopen(f) as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter)
        # Skip the header row. The builtin next() works on Python 2.6+ and
        # Python 3, whereas the reader's .next() method exists on Python 2 only.
        header = next(reader)
        # Every remaining row must have exactly two columns.
        for gene, exp in reader:
            expressions[gene].append(float(exp))

genes = sorted(expressions.keys())
medians = [np.median(expressions[g]) for g in genes]

# Write to the requested file, or to standard output when none was given.
fhandler = open(args.out, 'w') if args.out else sys.stdout

fhandler.write('Gene\t{}Median\n'.format(args.name if args.name else ''))
for gene, med in zip(genes, medians):
    fhandler.write('{}\t{:.6f}\n'.format(gene, med))
    args = parser.parse_args()

    data = {}

    for (exp_file, build, species, exp_type,
         sample_name) in zip(args.file_path, args.build, args.species,
                             args.exp_type, args.sample_names):
        data.setdefault((build, species, exp_type),
                        []).append([exp_file, sample_name])

    for (build, species, exp_type), data_values in data.items():
        df = pd.DataFrame(np.nan, index=[], columns=[])
        header = []
        for (exp_file, sample_name) in data_values:
            header.append(sample_name)
            with utils.gzopen(exp_file) as csvfile:
                reader = pd.read_csv(csvfile,
                                     index_col='Gene',
                                     delimiter='\t',
                                     dtype=str)
                df = pd.concat([df, reader], axis=1)

        # Add numbers to duplicated sample names.
        counts = Counter(header)
        for sample_name, num in counts.items():
            if num > 1:
                for suffix in range(1, num + 1):
                    header[header.index(sample_name)] = '{}_{}'.format(
                        sample_name, suffix)
        df.columns = header
        name = '_'.join([species, build, exp_type, 'all_expressions.txt'])
Ejemplo n.º 19
0
#     raise ValueError("Number of experiments must match the number of files")

genes = set()
expressions = []
headers = []
# How the gene sets of the individual files are combined.
op = set.intersection if args.intersection else set.union
offset = 0

for f in args.files:
    # Stop with a failure status as soon as one input file is missing.
    if not os.path.isfile(f):
        exit(1)

    # Files ending in .csv are read with ';' as delimiter, all others with tabs.
    base, ext = os.path.splitext(f)
    delimiter = ';' if ext == '.csv' else '\t'

    with utils.gzopen(f) as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter)
        # Column names without the leading gene column. The builtin next()
        # works on Python 2.6+ and Python 3, unlike the reader's .next().
        header = next(reader)[1:]
        # Use the experiment names given on the command line, if any, in
        # place of the file's own column names.
        headers.append(
            args.experiments[offset:offset + len(header)] if args.experiments else header)
        offset += len(headers[-1])
        # First column -> remaining columns of the row.
        expressions.append({r[0]: r[1:] for r in reader})

        file_genes = set(expressions[-1])
        if len(expressions) == 1:
            # The first file seeds the result. Testing for an empty `genes`
            # instead would also re-seed after an intersection had become
            # empty, bringing back genes that are missing from earlier files.
            genes = file_genes
        else:
            genes = op(genes, file_genes)

# Optionally narrow the result to the genes requested on the command line.
if args.genes:
    genes = genes.intersection(args.genes)

genes = sorted(genes)
Ejemplo n.º 20
0
# pylint: disable=missing-docstring,invalid-name
# XXX: Refactor to a command line tool and remove pylint disable
"""Change genes names to orthologues ones."""
from __future__ import absolute_import, division, print_function

import argparse
import csv

import utils

parser = argparse.ArgumentParser(
    description='Change genes names to orthologues ones.')

parser.add_argument('ortholog_file', help='file with orthologues')
parser.add_argument('genes', nargs='*', help='genes names')

args = parser.parse_args()

# Map the first column of every row of the tab-separated file to its second
# column; for a repeated first column the last row wins.
with utils.gzopen(args.ortholog_file) as ortholog_tsv:
    orthologues = {
        row[0]: row[1] for row in csv.reader(ortholog_tsv, delimiter='\t')
    }

# Swap in the mapped name wherever one exists. The slice assignment updates
# the argument list itself rather than a copy.
genes = args.genes
genes[:] = [orthologues.get(gene, gene) for gene in genes]

print(' '.join(genes))