Example #1
# Assumed imports (the same aliases appear in the later examples):
import sys
from contextlib import closing
from gzip import open as _gzip_open

def maybe_gzip_open(filename, *args, **kwargs):
    if filename.endswith('.gz'):
        return closing(_gzip_open(filename, *args, **kwargs))
    elif filename == '-':
        return sys.stdin
    else:
        return open(filename, *args, **kwargs)
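A minimal usage sketch (the path is hypothetical); the closing() wrapper is presumably there so the gzip handle can be used in a with block even on older Pythons whose GzipFile lacks context-manager support:

with maybe_gzip_open('input.txt.gz') as f:  # hypothetical path
    for line in f:
        print line.rstrip()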
Example #2
def maybe_gzip_open(filename, *args, **kwargs):
    if filename.endswith('.gz'):
        return closing(_gzip_open(filename, *args, **kwargs))
    elif filename == '-':
        return sys.stdin
    else:
        return open(filename, *args, **kwargs)
def find_harm_cnvs_in_no_harm_patients(out_pred_w_cnv_file, cnv_dict):
    f = _gzip_open(out_pred_w_cnv_file, 'r')
    for line in f:
        line = line.rstrip()
        line_split = line.split()
        nw_numharm = int(line_split[18])
        cnv_name = line_split[6]
        predicted = line_split[2]
        # Default to an empty string (not a list) so the split() below is safe.
        cnv_res = cnv_dict.get(cnv_name, '')
        cnv_res_split = cnv_res.split()
        if len(cnv_res) > 0:
            chrom = cnv_res_split[0]
            start = cnv_res_split[3]
            end = cnv_res_split[4]
        else:
            chrom = 'None'
            start = 'None'
            end = 'None'

        if nw_numharm == 0 and predicted == '2:HARMFUL':
            print '\t'.join(['%s:%s-%s' % (chrom, start, end), line, cnv_res])
    f.close()
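For context, find_harm_cnvs_in_no_harm_patients expects cnv_dict to map each CNV name to a whitespace-separated record whose fields 0, 3 and 4 hold chromosome, start and end; the values below are hypothetical:

# Hypothetical record layout; only fields 0, 3 and 4 are read above.
cnv_dict = {'cnv_00001': 'chr1 . . 1000000 1250000'}
find_harm_cnvs_in_no_harm_patients('predictions.txt.gz', cnv_dict)  # hypothetical path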
Example #4
def gzip_open(*args, **kwargs):
    return closing(_gzip_open(*args, **kwargs))
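This thin wrapper suggests the snippets target Python versions before 2.7, where GzipFile could not be used directly in a with statement; contextlib.closing() supplies the missing context-manager protocol. A usage sketch with a hypothetical filename:

with gzip_open('data.txt.gz', 'r') as f:  # hypothetical path
    header = f.readline()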
Example #5
def maybe_gzip_open(filename, *args, **kwargs):
    if filename.lower().endswith(".gz"):
        return closing(_gzip_open(filename, *args, **kwargs))
    else:
        return open(filename, *args, **kwargs)
Example #6
def maybe_gzip_open(filename, *args, **kwargs):
    if filename.lower().endswith('.gz'):
        return closing(_gzip_open(filename, *args, **kwargs))
    else:
        return open(filename, *args, **kwargs)
def random_out(iteration, info_dict, num_sets):

    if args.add_remaining == 'new':
        num_sets_w_new = num_sets + 1
    else:
        num_sets_w_new = num_sets

    out_dict_encoding = fill_out_set(num_sets_w_new, {'cnvbal':{}, 'cnvnbal':{}})
    
    iteration_counts = OrderedDict()
    iteration_file = {}
    for set_id in range(num_sets_w_new):
        iteration_counts[set_id] = OrderedDict()
        iteration_file[set_id] = OrderedDict()
        for bal_nbal in ['cnvbal', 'cnvnbal']:
            iteration_counts[set_id][bal_nbal] = OrderedDict()
            # Track the distinct HARMFUL/BENIGN items seen at each level.
            for level in ['PHENOTYPE', 'PATIENT', 'CNV', 'GENE']:
                iteration_counts[set_id][bal_nbal][level] = OrderedDict()
                iteration_counts[set_id][bal_nbal][level]['HARMFUL'] = set()
                iteration_counts[set_id][bal_nbal][level]['BENIGN'] = set()
            iteration_file[set_id][bal_nbal] = _gzip_open('%s_%s_%s_%s.arff.gz' % (set_id, args.out_file, iteration, bal_nbal), 'w')

    clone_dict = expand_info_dict(info_dict)
    # clone_dict = copy.deepcopy(info_dict)

    phenotype, out_genes_cnv_bal, out_genes_cnv_nbal = random_info_dict(clone_dict, args.balance_test, num_sets)
    i = 0

    while phenotype:
        # print query_info_dict(clone_dict)
        for bal_nbal, out_genes in zip(['cnvbal', 'cnvnbal'], [out_genes_cnv_bal, out_genes_cnv_nbal]):
            for label, gene_list in out_genes.iteritems():
                weka_gene_i = 0
                for weka_gene in gene_list:
                    (patient, cnv, case_control, dup, hposim, original_gene, hpo_term, infomax) = weka_gene.split('\t')[2:10]
                    iteration_counts[label][bal_nbal]['PHENOTYPE'][case_control].add(hpo_term)
                    iteration_counts[label][bal_nbal]['PATIENT'][case_control].add(patient)
                    iteration_counts[label][bal_nbal]['CNV'][case_control].add(cnv)
                    iteration_counts[label][bal_nbal]['GENE'][case_control].add('%s_%s_%s' % (cnv, original_gene, weka_gene_i))
                    # print hpo_term, patient, cnv, original_gene
                    if weka_gene not in out_dict_encoding[label][bal_nbal]:
                        out_dict_encoding[label][bal_nbal][weka_gene] = 0
                    out_dict_encoding[label][bal_nbal][weka_gene] += 1
                    weka_gene_i += 1


        phenotype, out_genes_cnv_bal, out_genes_cnv_nbal = random_info_dict(clone_dict, args.balance_test, num_sets)
        # print phenotype, query_info_dict(clone_dict)
        i += 1

    for bal_nbal in ['cnvbal', 'cnvnbal']:
        for label in range(num_sets_w_new):
            # num_count -> arff weight

            # TODO: maybe should be at randomization level?
            for gene_line, num_count in out_dict_encoding[label][bal_nbal].iteritems():
                if args.weighted_gene_duplication == 'sim':
                    exponent = 2
                else:
                    exponent = 1

                print >> iteration_file[label][bal_nbal], gene_line.replace('{}', '{%s}' % num_count**exponent)
                # TODO: use this to duplicate lines instead of weighing
                # for i in range(num_count):
                #     print >> iteration_file[label][bal_nbal], gene_line.replace('{}', '{%s}' % 1)
            iteration_file[label][bal_nbal].close()

    print 'iteration: %s, %s' % (iteration, i)
    for train_test_key, train_test_val in iteration_counts.iteritems():
        for subset_key, subset_val in train_test_val.iteritems():
            for case_control_key, case_control_val in subset_val.iteritems():
                for class_key, class_val in case_control_val.iteritems():
                    print train_test_key, subset_key, case_control_key, class_key, len(class_val)
    sys.stdout.flush()
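For reference, the replace('{}', ...) call in random_out fills in what appears to be a Weka ARFF instance weight: each data line carries a trailing {} placeholder that becomes {num_count**exponent}. A minimal sketch of the string idiom with hypothetical values:

gene_line = '0.51,1.2,HARMFUL {}'  # hypothetical ARFF line with a weight placeholder
num_count, exponent = 2, 2
print gene_line.replace('{}', '{%s}' % num_count**exponent)  # -> 0.51,1.2,HARMFUL {4}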
Example #8
def gzip_open(*args, **kwargs):
    return closing(_gzip_open(*args, **kwargs))
Example #9
import sys
import re
import argparse
from gzip import open as _gzip_open

parser = argparse.ArgumentParser(description='Remove certain features from an arff file.')

parser.add_argument('--remove', '-R', help="The feature indexes to remove.", required=True)
parser.add_argument('--input', '-i', help="Input arff file.", required=True)
parser.add_argument('--output', '-o', help="Output arff file. If none, use stdout.")
parser.add_argument('--debug', '-d', help="Debug", action='store_true')

args = parser.parse_args()

if args.output is None:
    output_file = sys.stdout
else:
    output_file = _gzip_open(args.output, 'w')

intervals = args.remove.split(',')

remove_list = []
for interval in intervals:
    if '-' in interval:
        (start, end) = interval.split('-')
        start = int(start)
        end = int(end)
        remove_list += range(start-1, end)
    else:
        remove_list.append(int(interval)-1)

remove_list_mone = [remove + 1 for remove in remove_list]
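To make the index arithmetic concrete, the loop above converts 1-based command-line indexes into 0-based positions; a self-contained sketch with a hypothetical --remove value:

demo = []
for interval in '2,5-7'.split(','):  # hypothetical --remove value
    if '-' in interval:
        (start, end) = interval.split('-')
        demo += range(int(start) - 1, int(end))
    else:
        demo.append(int(interval) - 1)
print demo  # -> [1, 4, 5, 6]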
parser.add_argument('similarity_rank_cutoff', type=int)
parser.add_argument('balance_test', choices=['bt_none', 'bt_remaining', 'bt_patient', 'bt_ptrem'], \
                    help="none for not balancing, remaining to balance and keep the remaining, patient to balance by patient, ptrem to balance by patient and keep the remainging.")
parser.add_argument('--balance_genes', '-b', action='store_true')
parser.add_argument('--debug', '-d', help="Debug.", action='store_true')

args = parser.parse_args()
if args.neighbour_weight_function == "0":
    anwf = "1"
else:
    anwf = args.neighbour_weight_function
fn_str = 'lambda x: %s' % anwf
print fn_str
NEIGHBOUR_WEIGHT_FUNCTION = eval(fn_str)

f = _gzip_open(args.weka_file, 'r')
line = f.readline().split('\t')
for i in range(len(line)):
    if '{' in line[i]:
        cutoff_index = i
        break
f.close()

prev = None
current = None

# out_gene_file = open(args.out_gene_file, 'w')

log_line_original_gene = []
log_score_original_gene = []
log_line_cnv = []
def gzip_open(*args, **kwargs):
    return _gzip_open(*args, **kwargs)
                "IEA",
            },
        ),
    ]
)

RESERVED_FIELD_NUM = 7
if args.neighbour_weight_function == "0":
    anwf = "1"
else:
    anwf = args.neighbour_weight_function
fn_str = "lambda x: %s" % anwf
print fn_str
NEIGHBOUR_WEIGHT_FUNCTION = eval(fn_str)
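The eval() line turns the command-line expression into a one-argument weight function; a minimal sketch with a hypothetical expression (eval() on arbitrary input is only defensible in a local analysis script like this one):

fn = eval('lambda x: %s' % '1.0/x')  # hypothetical --neighbour_weight_function value
print fn(4)  # -> 0.25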

output_file = _gzip_open(args.output_file, "w")


def timer(s):
    global start
    global totalstart
    totalelapsed = time.clock() - totalstart
    elapsed = time.clock() - start
    start = time.clock()
    print "%s...%s...%s" % (s, elapsed, totalelapsed)
    sys.stdout.flush()


def load_gene_net():
    iin = open(args.gene_network_file, "r")
                        namespace_dict[current_ont] = current_namespace
    f.close()
    return ont_dict, namespace_dict

# file to create hp <-> hp layer
hp2parents, _ = parse_ontology(args.hp_file)

# file to create go <-> go layer
go2parents, namespace_dict = parse_ontology(args.go_file)

if args.j48_graph_file == '_':
    print json.dumps(namespace_dict, indent=4)

if args.j48_graph_file != '_':

    f = _gzip_open(args.j48_graph_file, 'r')
    for line in f:
        line = line.strip()

        if not line:
            continue
        if line[0] != 'N':
            continue

        if '->' in line:
            n1 = uniq_id + "_" + line.split('->')[0]
            n2 = uniq_id + "_" + line.split('->')[1].split()[0]
            label = line.split('"')[1]

            label = label.replace('>', 'gt')
            label = label.replace('<', 'lt')
Example #14
from gzip import open as _gzip_open
from collections import OrderedDict

parser = argparse.ArgumentParser(description="Create a CNV arff file from a gene arff file.")

parser.add_argument('cnvs_w_dgv_overlap', help="DGV annotation file.")
parser.add_argument('out_pt', help="Results pivot table file.  Summary of harmful/benign classification of genes per cnv.")
parser.add_argument('out_pred_w_cnv', help="Weka predictions with cnv annotations.")
parser.add_argument('--incorrect', '-i', help="Only output incorrect.", action='store_true')
parser.add_argument('--debug', '-d', help="Debug", action='store_true')

args = parser.parse_args()

cnv_dict = OrderedDict()
# arff
f = _gzip_open(args.cnvs_w_dgv_overlap, 'r')
for line in f:
    (cnv, length, num_overlap, metric_overlap, sample, phenotype) = line.rstrip().split()
    cnv_dict[cnv] = OrderedDict([['length', length],
                                  ['num_overlap', num_overlap],
                                  ['metric_overlap', metric_overlap],
                                  ['sample', sample],
                                  ['phenotype', phenotype],
                                  ['maxgene', 'NoCnv'],
                                  ['conf', -1],
                                  ['simscore', -1],
                                  ])
f.close()
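Each CNV name now maps to an ordered record; a lookup sketch with a hypothetical key:

rec = cnv_dict.get('cnv_00001')  # hypothetical CNV name
if rec is not None:
    print rec['phenotype'], rec['conf']  # 'conf' is still the -1 placeholder at this point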

# assign the max gene to look at for each cnv
f = _gzip_open(args.out_pred_w_cnv)
Example #15
import sys
import argparse
from pandas import *
import itertools
import copy
from gzip import open as _gzip_open

parser = argparse.ArgumentParser(description='Calculate the DGV overlap feature for each CNV.')

parser.add_argument('cnvs_w_dgv_bed', help="Input CNVs/DGV overlap bed file.")
parser.add_argument('--debug', '-d', help="Debug.", action='store_true')

args = parser.parse_args()

# arff
f = _gzip_open(args.cnvs_w_dgv_bed, 'r')
prev_cnv = None
cnv_name = None
overlap_log = []

line = f.readline()
keep_looping = True

while keep_looping:

    if not line:
        keep_looping = False
        cnv_name = None
    else:
        line = line.strip().split('\t')
        prev_cnv = cnv_name