Example #1
0
def gather_files(path, replicates, name):
    """Group the "annotated" files found under `path` by replicate prefix.

    Every file whose path contains "annotated" is collected and sorted.
    A file is added to a replicate's list when its basename starts with
    that replicate string, so one file can land in several lists.  The
    complete sorted list is additionally stored under the key `name`
    (replacing a replicate entry with the same key, if any).

    Returns a defaultdict(list) mapping replicate/name -> list of paths.
    """
    annotated = sorted(x for x in get_only_files(path) if "annotated" in x)

    grouped = defaultdict(list)
    for file_path in annotated:
        base = os.path.basename(file_path)
        for replicate in replicates:
            if base.startswith(replicate):
                grouped[replicate].append(file_path)

    grouped[name] = annotated
    return grouped
Example #2
0
    type=int,
    help="Flank length of the peak area around its highest point")
parser.add_argument('--outdir',
                    required=True,
                    nargs='?',
                    type=str,
                    help="Path to the output directory")
args = parser.parse_args()


def check_interval(interval, mincov):
    """Return True when every 'topcoverage' value is strictly above `mincov`.

    `interval.attrs['topcoverage']` is read as a comma-separated string of
    numbers.  All values are converted before the check, so a malformed
    entry raises ValueError regardless of its position.
    """
    raw_values = interval.attrs['topcoverage'].split(",")
    above_threshold = [float(value) > mincov for value in raw_values]
    return all(above_threshold)


for path in get_only_files(args.path):
    if (path.endswith('gff') or path.endswith('bed')):
        bedtool = BedTool(path)
        if (len(bedtool)):
            with open(os.path.join(args.outdir, os.path.basename(path)),
                      'w') as f:
                for interval in bedtool:
                    if (check_interval(interval, args.mincov)):
                        center = int(interval.name)
                        f.write(
                            str(
                                Interval(interval.chrom,
                                         center - args.flank,
                                         center + args.flank,
                                         name=interval.name,
                                         strand=interval.strand,
Example #3
0
    '--log',
    nargs='?',
    default=False,
    const=True,
    type=bool,
    help="If set, log2 transformation is apllied to the coverage")
parser.add_argument('--labels',
                    nargs='?',
                    required=True,
                    type=str,
                    help="Path to the file with peak labels")
args = parser.parse_args()

# Fixed list of sample names (presumably the order used for display --
# its use is not visible in this part of the file).
NAME_ORDER = [
    'glu_wt',
    'glu_ko_cyab',
    'ace_glu_wt',
    'ace_glu_ko_cyab',
]

# Only files whose path mentions "normalized" take part in the analysis.
files = [p for p in get_only_files(args.path) if "normalized" in p]

# Group the files by their basename with the last "_"-separated chunk
# removed; a basename without any underscore maps to the empty string.
name2files = defaultdict(list)
for f in files:
    name = os.path.basename(f).rpartition("_")[0]
    name2files[name].append(f)

# Number of distinct sample names found among the input files.
size = len(name2files)
# Maps each sample name to the coverage averaged over its files.
name2coverage = {}
for name, local_files in name2files.items():
    # Only the first value of each coverage2dict() result is used
    # (presumably one chromosome/track per file -- TODO confirm).
    local_coverages = [list(coverage2dict(f).values())[0] for f in local_files]
    # Position-wise mean across all files that share this name.
    averaged_coverage = np.mean(local_coverages, axis=0)
    if (args.log):
        # log2(x + 1) keeps a coverage of zero at zero; note that the
        # result is a plain list here rather than a numpy array.
        averaged_coverage = [np.log2(x + 1) for x in averaged_coverage]
    name2coverage[name] = averaged_coverage
    # Reassigned on every iteration, so after the loop it holds the
    # length of the last processed coverage track.
    length_coverage = len(averaged_coverage)
Example #4
0
                for x in genes if x.strand == '+']
    tss_list.extend([(x,
                      fasta[x.chrom].seq[x.end - downstream:x.end + upstream])
                     for x in genes if x.strand == '-'])
    for tss, seq in tss_list:
        for adstart, adend, lseq, at_fraction, gc_count in get_at_rich_stretches(
                seq, 5, 20, 4, 0.4):
            stretches.append(
                (tss.chrom, tss.strand, tss.start + adstart, tss.start + adend,
                 lseq, stretch_score(at_fraction, adend, adstart), seq))

    stretches.sort(key=lambda x: x[5], reverse=True)
    return stretches[:top]


raw_files = sorted(get_only_files(args.path))

# Walk the sorted listing two entries at a time: (1st, 2nd), (3rd, 4th), ...
# The first member of each pair is read as fasta and the second as the
# gene annotation; a trailing unpaired file is ignored.  The sample name
# is the first file's basename up to its first dot.
files_list = [(os.path.basename(first).split(".")[0], first, second)
              for first, second in zip(raw_files[0::2], raw_files[1::2])]

for name, fasta_path, gff_path in files_list:
    genome = SeqIO.to_dict(SeqIO.parse(fasta_path, "fasta"))
    genes = BedTool(gff_path)

    # One tab-separated report per sample, written into the output folder.
    report_path = os.path.join(args.outdir, "%s.at_stretches.tsv" % name)
    with open(report_path, 'w') as f:
        f.write("chromosome\tstrand\tstart\tstop\tseq\tscore\tall_tss_seq\n")
        stretches = get_tss_at_contents(genes, genome, args.upstream,
                                        args.downstream, args.top)
        for el in stretches:
            f.write("%s\t%s\t%d\t%d\t%s\t%1.2f\t%s\n" % el)
Example #5
0
# Positional argument: folder holding the read files.
parser.add_argument('path',
                    metavar='N',
                    nargs='?',
                    type=str,
                    help="Path to the folder with reads")
# The table path is converted to an absolute path while parsing.
parser.add_argument('--table',
                    nargs='?',
                    type=os.path.abspath,
                    required=True,
                    help="Path to the sample table, tsv format")
parser.add_argument('--outdir',
                    nargs='?',
                    type=str,
                    required=True,
                    help="Path to the output directory")
# A '--paired' switch and an optional variant of '--table' used to be
# declared here; both are currently disabled.
args = parser.parse_args()


# sample name -> (condition, time label, "chap" | "control")
sample2type = {}

# Table columns as used here: 0 = condition, 1 = time, 2 = sample name
# registered as "chap", 3 = sample name registered as "control".
with open(args.table) as table:
    for row in table:
        fields = row.strip().split("\t")
        # Spaces and dots are removed from the time label, which later
        # becomes part of a file name.
        time = fields[1].replace(" ", "").replace(".", "")
        sample2type[fields[2]] = (fields[0], time, "chap")
        sample2type[fields[3]] = (fields[0], time, "control")

# Create one output folder per (condition, type) combination up front.
for cond in {entry[0] for entry in sample2type.values()}:
    for type_ in ('chap', 'control'):
        path = os.path.join(args.outdir, "%s_%s" % (cond, type_))
        Path(path).mkdir(parents=True, exist_ok=True)

# Copy every fastq file into its folder under the name <time>.<mate>.fastq.
for f in get_only_files(args.path):
    if not f.endswith('fastq'):
        continue
    # The basename must consist of exactly three dot-separated parts:
    # sample name, mate and extension.
    name, mate, _ = os.path.basename(f).split(".")
    cond, time, type_ = sample2type[name]
    path = os.path.join(args.outdir,
                        "%s_%s" % (cond, type_),
                        "%s.%s.fastq" % (time, mate))
    copyfile(f, path)
Example #6
0
# Command-line interface: a positional coverage folder, the rRNA
# annotation and genome files, the detection thresholds and the output
# format.
parser = argparse.ArgumentParser(
    description='Detects nondepleted rRNA regions based on the genomic coverage')
parser.add_argument('path',
                    metavar='N',
                    nargs='?',
                    type=str,
                    help="Path to the coverage folder")
parser.add_argument('--rrna',
                    nargs='?',
                    required=True,
                    type=str,
                    help="Path to the rRNA, gff file")
parser.add_argument('--genome',
                    nargs='?',
                    required=True,
                    type=str,
                    help="Path to the genome, fasta file")
parser.add_argument(
    '--minfraction',
    nargs='?',
    default=1,
    type=float,
    help="Minimal required fraction/multiplier (of the mean rRNA coverage) for a particular position to be counted as nondepleted")
# NOTE(review): the length threshold is parsed as a float although it
# describes a length -- confirm whether an int is intended.
parser.add_argument('--minlength',
                    nargs='?',
                    default=20,
                    type=float,
                    help="Minimal required length of non-depleted regions")
parser.add_argument('--outtype',
                    nargs='?',
                    choices=['fa', 'tsv'],
                    default='tsv',
                    type=str,
                    help="Type of the output file, fasta or tsv")
args = parser.parse_args()

# The second-to-last dot-separated part of a coverage file path names the
# strand; translate it into the usual +/- symbols.
strand_conv = {'plus': '+', 'minus': '-'}

genome = SeqIO.to_dict(SeqIO.parse(args.genome, 'fasta'))

# Accumulate the coverage per (chromosome, strand) over all input files.
files = get_only_files(args.path)
strand2coverage = {}
for f in files:
    strand = strand_conv[f.split(".")[-2]]
    for chrom, cov in coverage2dict(f, cpos=2).items():
        key = (chrom, strand)
        if key not in strand2coverage:
            strand2coverage[key] = cov
        else:
            # In-place addition onto the coverage stored for this key.
            strand2coverage[key] += cov

# Total coverage summed over all chromosomes and strands.
total_sum = sum(sum(x) for x in strand2coverage.values())

rrna2coverage =[];
for interval in BedTool(args.rrna):
Example #7
0
    mlist.append('clean:\n\techo "nothing to clean."\n')

    return "\n\n".join(mlist)


seq_package = os.path.join(args.package, 'sequencing')

# Table layout: first column is the file name, the remaining columns are
# kept as a list (its first item is the sample name and, in paired mode,
# its second item is the 1-based mate number).
sample2name = {}
with open(args.table) as table:
    for row in table:
        file_name, *rest = row.strip().split("\t")
        sample2name[file_name] = rest

# Paired mode keeps a two-slot list [mate1, mate2] per sample name;
# single-end mode maps the sample name straight to its file.
name2sample = defaultdict(lambda: [None] * 2) if args.paired else {}
for sample in get_only_files(args.path):
    entry = sample2name.get(os.path.basename(sample))
    if not entry:
        # Missing (or empty) table entries are reported, not fatal.
        sys.stderr.write(
            "Sample %s was not found in the provided table\n" % sample)
    elif args.paired:
        name2sample[entry[0]][int(entry[1]) - 1] = sample
    else:
        name2sample[entry[0]] = sample
Example #8
0
    next(f)
    for l in f:
        a = l.strip().split("\t")
        start = int(a[1])
        stop = int(a[2])
        temp = ["%1.1f" % float(x) if x != 'None' else '0.0' for x in a[9:14]]
        interval = Interval("NC_003450.3", start, stop,
                            "_".join(a[5:7] + temp), '0', '+')
        #print(interval.name)
        reference.append(interval)

# Turn the collected list of intervals into a BedTool.
reference = BedTool(reference)

# For every reference interval name, count the replicate folders in which
# it is reported by the intersection with that folder's "filtered" files.
overlap_counter = defaultdict(int)
for folder in args.replicates:
    files = [x for x in get_only_files(folder) if "filtered" in x]
    replicates_list = [BedTool(x) for x in files]
    overlapping = reference.intersect(b=replicates_list, u=True)
    for hit in overlapping:
        overlap_counter[hit.name] += 1

# Append the count as an extra column to each stored row and print the
# row tab-separated; names that never overlapped are reported as 0.
for name, columns in name2string.items():
    columns.append(str(overlap_counter[name]))
    print("\t".join(columns))

#for k, v in overlap_counter.items():
#m = name2string[k]
Example #9
0

def get_tpms(path):
    """Map interval name -> float 'tpm' attribute for the file at `path`.

    If a name occurs more than once, the value of its last occurrence
    is kept.
    """
    return {
        interval.name: float(interval.attrs['tpm'])
        for interval in BedTool(path)
    }


def line2score(l):
    """Return the grand total of all values in `l[1:]`.

    The first element of `l` is skipped; every remaining element is an
    iterable of numbers whose sum contributes to the score.
    """
    score = 0
    for values in l[1:]:
        score += sum(values)
    return score


# Annotation intervals indexed by their 'ID' attribute.
gene2annotation = {x.attrs['ID']: x for x in BedTool(args.annotation)}

# Group the (sorted) input files by label, i.e. the basename without its
# last "_"-separated chunk.
label2file = defaultdict(list)
for f in sorted(get_only_files(args.path)):
    label = os.path.basename(f).rpartition("_")[0]
    label2file[label].append(f)

# From here on label2file is a list of (label, files) pairs: either in
# the order requested on the command line (labels without files get an
# empty list) or sorted alphabetically by label.
if args.order:
    label2file = [(wanted, label2file[wanted]) for wanted in args.order]
else:
    label2file = sorted(label2file.items(), key=lambda pair: pair[0])

# Accumulators for the per-label processing that follows.
expression = []
labels = []
genes = set()
for label, local_files in label2file:
    local_expr = []
    for lf in local_files:
        #print(lf)
Example #10
0
    plt.xticks(xvals, xticks, rotation=45)

    plt.savefig(os.path.join(args.outdir, "%s.%s" % (name, args.format)),
                format=args.format)
    plt.clf()
    plt.close()


if (args.mode == 'length'):
    original = [(int(x.name), float(x.score)) for x in BedTool(args.original)]
    original.sort(key=lambda x: x[0])
    #for r1, r2 in split2chunks(original, 2):
    #print(abs(r1[0] - r2[0]))

    length2step = []
    for path in [x for x in get_only_files(args.detected) if 'annotated' in x]:
        name = get_name(path, args.mode)
        xticks, yvals = process_detected(path, original)
        length2step.append((int(name),
                            find_step_for_sample(yvals, xticks, name,
                                                 args.detection_fraction)))
        draw_single(yvals, xticks, name)
    xlabel = 'Read length'

elif (args.mode == 'ratio'):
    original_dict = {}
    for path in get_only_files(args.original):
        name = os.path.basename(path).split(".")[0][1:]
        original = [(int(x.name), float(x.score)) for x in BedTool(path)]
        original.sort(key=lambda x: x[0])
        original_dict[name] = copy.copy(original)
Example #11
0
import yaml
from afbio.generators import get_only_files

# Directory that contains this module (symlinks resolved); it is scanned
# below for configuration files.
_confdir = os.path.dirname(os.path.realpath(__file__))


class LocalConfigError(Exception):
    """Raised when a configuration file does not load into a dictionary."""


# CONFIGS maps a shortcut name to the path of a configuration file, so a
# configuration can be requested by shortcut instead of by full path.
# It used to be a hand-written dict (samstat, bedstat, chiflex, ...); now
# every file located in the module directory is registered under its
# basename up to the first dot.  No extension filter is applied.
CONFIGS = {}
for path in get_only_files(_confdir):
    shortcut = os.path.basename(path).partition(".")[0]
    CONFIGS[shortcut] = path


def load_config(configuration):
    p = CONFIGS.get(configuration)
    if (not p):
        p = configuration

    with open(p, 'r') as f:
        d = yaml.load(f, Loader=yaml.FullLoader)
    if (isinstance(d, dict)):
        return d
    else:
        raise LocalConfigError(
            'Config file %s is malformatted. It has to be convertible into dictionary'
Example #12
0
def parse_file(path):
    """Read a tab-separated table of comma-separated numeric cells.

    Layout of the file:
      line 1  -- header; everything after the first column is a label
      line 2  -- expression row; first column skipped, each further cell
                 is a comma-separated list of floats
      line 3+ -- one variant per line: name in the first column, then
                 cells in the same format as the expression row

    Returns (labels, var_names, expression, variants).
    """

    def to_floats(cells):
        # "1.0,2.0" -> [1.0, 2.0] for every cell
        return [[float(value) for value in cell.split(",")] for cell in cells]

    with open(path) as handle:
        labels = next(handle).strip().split("\t")[1:]
        expression = to_floats(next(handle).strip().split("\t")[1:])

        var_names = []
        variants = []
        for line in handle:
            var_name, *cells = line.strip().split("\t")
            variants.append(to_floats(cells))
            var_names.append(var_name)

    return labels, var_names, expression, variants


normal_list = []
spurious_list = []

# Every "tsv" file in the input folder is parsed and checked; the file's
# basename up to the first dot serves as the name.
tsv_paths = [p for p in get_only_files(args.path) if p.endswith("tsv")]
for path in tsv_paths:
    name = os.path.basename(path).split(".")[0]
    labels, var_names, expression, variants = parse_file(path)
    res = check(labels, var_names, expression, variants, args.minexpr,
                args.mindiff, args.minfraction)
    # res[0] goes to the "normal" list and res[1] to the "spurious" list;
    # empty/falsy results are skipped.
    if res[0]:
        normal_list.append((name, res[0]))
    if res[1]:
        spurious_list.append((name, res[1]))

# Order both lists by the last element of the stored result, largest first.
normal_list.sort(key=lambda entry: entry[1][-1], reverse=True)
spurious_list.sort(key=lambda entry: entry[1][-1], reverse=True)
with open(os.path.join(args.outdir, "normal.tsv"), 'w') as f:
    header = [
        "gene", "time1", "time2", "tss", "score", "expression1", "expression2",
        "fraction1", "fraction2"