Example #1
    def depth_info(self):

        cmd = '{igvtools} count -w {window_size} {infile} {outdir}/depthRaw_{outsuffix}.wig novo37'.format(
            **self.__dict__)
        print 'run cmd:', cmd
        assert not os.system(cmd)

        out_wig = '{outdir}/depthRaw_{outsuffix}.wig'.format(**self.__dict__)
        outfile = '{outdir}/depth_{outsuffix}'.format(**self.__dict__)

        with utils.safe_open(out_wig) as f, utils.safe_open(outfile,
                                                            'w') as out:
            for line in f:
                if line.startswith('track'):
                    continue
                elif line.startswith('variableStep'):
                    chrom = re.findall(r'chrom=(.+?) ', line)[0].strip('chr')
                    continue
                linelist = line.strip().split('\t')
                start = int(linelist[0])
                depth = float(linelist[1])

                depth_log10 = math.log10(depth + 1)

                end = start + self.window_size - 1
                if start + self.window_size > CHROM_LENGTH[chrom]:
                    end = CHROM_LENGTH[chrom]

                line = 'hs{chrom}\t{start}\t{end}\t{depth_log10}\n'.format(
                    **locals())
                out.write(line)

        print 'write file: {}'.format(outfile)
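
Most of these snippets rely on a small `safe_open` helper from a local `utils` module (imported directly, or as `utils.`/`u.`) instead of the builtin `open`. Its implementation is not shown here; as a rough sketch of the behaviour the write paths appear to depend on (creating the output directory and handling `.gz` files transparently -- both assumptions, not the original code), it might look like this:

import gzip
import os


def safe_open(filename, mode='r'):
    # Sketch only: directory creation and gzip support are assumptions,
    # not the original utils.safe_open.
    if 'w' in mode or 'a' in mode:
        dirname = os.path.dirname(filename)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
    if str(filename).endswith('.gz'):
        if mode in ('r', 'w', 'a'):
            mode += 't'  # text mode so callers can iterate over lines
        return gzip.open(filename, mode)
    return open(filename, mode)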
Example #2
    def cnv_info(self):

        outfile = '{outdir}/{vtype}_{outsuffix}'.format(**self.__dict__)

        with utils.safe_open(self.infile) as f, utils.safe_open(outfile,
                                                                'w') as out:
            print 'open file: {}'.format(self.infile)
            for line in f:
                linelist = line.strip().split('\t')
                if linelist[0] in ('Chr', '#Chr'):
                    headerlist = linelist
                    continue
                chrom = linelist[0].strip('Chr').strip('chr')
                start = linelist[headerlist.index('Start')]
                end = linelist[headerlist.index('End')]

                if chrom not in self.normal_chrom:
                    continue

                if self.vtype == 'freec':
                    copynumber = int(
                        linelist[headerlist.index('CopyNumber')]) - 2
                    copynumber = 6 if copynumber > 6 else copynumber
                elif self.vtype == 'cnvnator':
                    copynumber = float(linelist[headerlist.index('RD')])
                    copynumber = 3 if copynumber > 3 else copynumber

                line = 'hs{chrom} {start} {end} {copynumber}\n'.format(
                    **locals())
                out.write(line)

        print 'write file: {}'.format(outfile)
Example #3
def create_nginx_redirect_config(env_file, hostname):
    if hostname.startswith("www."):
        redirect_from = hostname[4:]
    else:
        redirect_from = "www.%s" % hostname

    config_dst = Path(os.path.join(VHOSTD_DIR, redirect_from))

    redirect_prompt = (
        "Do you want to set 301 redirect from %s to %s? "
        "(requires domain to be already configured)"
    ) % (redirect_from, hostname)
    if input_bool(redirect_prompt):
        env_file["LETSENCRYPT_HOST"] = env_file["VIRTUAL_HOST"]
        env_file.save()
    else:
        env_file["LETSENCRYPT_HOST"] = hostname
        env_file.save()
        if config_dst.is_file():
            config_dst.unlink()
        return False

    with safe_open(REDIRECT_CONFIG, "r") as f:
        tpl = f.read()
    redirect_config = tpl.replace("domain.com", hostname)

    with safe_open(config_dst, "w") as f:
        f.write(redirect_config)

    return True
Example #4
    def write_ped(self, pedfile, context):

        with utils.safe_open(pedfile, 'w') as ped:
            line = '{familyid}\t{sampleid}\t{pa}\t{ma}\t{sex}\t{phenotype}\n'.format(**context)
            line += '{familyid}\t{sampleid}\t{pa}\t{ma}\t{sex}\t{phenotype}\n'.format(**context['pa_context'])
            line += '{familyid}\t{sampleid}\t{pa}\t{ma}\t{sex}\t{phenotype}\n'.format(**context['ma_context'])
            ped.write(line)
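
The expected shape of `context` can be read off the format strings: a record for the proband plus nested `pa_context`/`ma_context` dicts with the same keys for the parents. A hypothetical call, with every value invented for illustration, might be:

context = {
    'familyid': 'FAM1', 'sampleid': 'child1',
    'pa': 'father1', 'ma': 'mother1', 'sex': '1', 'phenotype': '2',
    'pa_context': {'familyid': 'FAM1', 'sampleid': 'father1',
                   'pa': '0', 'ma': '0', 'sex': '1', 'phenotype': '1'},
    'ma_context': {'familyid': 'FAM1', 'sampleid': 'mother1',
                   'pa': '0', 'ma': '0', 'sex': '2', 'phenotype': '1'},
}
pipeline.write_ped('FAM1.ped', context)  # `pipeline` stands in for an instance of the class above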
Example #5
def main():
    global no_neighbor_count
    with safe_open(outpath, exist_ok='exit') as outfile:
        # write header
        #outfile.write('\t'.join(['head_id', 'gene_id', 'flag', 'distance'])+'\n')

        relations = pd.DataFrame()

        for head_group_name, head_group in tqdm(head_groups, desc='Contigs'):

            if head_group_name not in gene_groups.groups:
                prinf(
                    'Não há nenhum gene no cromossomo. As heads abaixo não possuem NG.'
                )
                prinf(head_group)
                no_neighbor_count += head_group.shape[0]
                continue

            gene_group = gene_groups.get_group(head_group_name)
            chunks = np.array_split(head_group, n_cpu)

            with mp.Pool() as pool:
                pool_results = pool.starmap(parse_chunk,
                                            ((c, gene_group, cn)
                                             for cn, c in enumerate(chunks)))

                for chunk_relations in pool_results:
                    relations = relations.append(chunk_relations)
                    # print('\nCHUN', chunk_relations, '\nREL', relations)

        relations.columns = ['head_id', 'gene_id', 'flag', 'distance']
        relations.to_csv(outfile, sep='\t', index=False)
        log(f'\nConcluído. Relações salvas em {str(outpath)}.')

    return relations, no_neighbor_count
Example #6
    def render_html(self):
        if self.rep_ty == 'qc':
            self.context['report_type'] = 'QC'
            self.analy_type = '质控'

        elif self.rep_ty == 'mapping':
            self.context['report_type'] = 'Mapping'
            self.context['mapping'] = True
            self.analy_type = '比对'

        elif self.rep_ty == "primary":
            self.context['report_type'] = 'Primary'
            self.context['mapping'] = True
            self.context['primary'] = True
            self.analy_type = '基本分析'

        elif self.rep_ty == "advance":
            self.context['report_type'] = 'Advance'
            self.context['mapping'] = True
            self.context['primary'] = True
            self.context['advance'] = True
            self.analy_type = '高级分析'

        else:
            sys.exit("plz select rigth report type ('qc', 'mapping', 'primary', 'advance')")
        
        self.context['analy_type'] = self.analy_type
        check_html = self.env.get_template('TestDemo.html').render(self.context)
        outfile = os.path.join(self.checkDir , self.rep_ty+'_check.html')
        with utils.safe_open(outfile, 'w') as out:
            out.write(check_html)

        return outfile
Example #7
    def get_indel_info(self):
        '''Process the indel VCF file.

        When processing an indel VCF, pos, ref and alt need to be adjusted so that
        they match the annotation's pos, ref and alt.
        {
            'chr_pos_ref/alt': [(case), (control)]
        }
        '''
        with utils.safe_open(self.vcf, 'r') as fr:
            for line in fr:
                if line.startswith('##'):
                    continue
                elif line.startswith('#'):
                    head = line.strip()
                    head_index = utils.get_head_index(head)
                    continue

                linelist = line.strip().split('\t')
                _chr = linelist[head_index['#chrom']]
                pos = linelist[head_index['pos']]
                ref = linelist[head_index['ref']]
                alt = linelist[head_index['alt']]

                pos, ref, alt = utils.modify_pos_ref_alt(pos, ref, alt)
                if 'cancer' in head_index:
                    case = linelist[head_index['cancer']]
                if 'normal' in head_index:
                    control = linelist[head_index['normal']]
Example #8
def main():

    infile = args['infile']
    filetype = args['type']
    add_header = args['add_header']

    outdir = args['outdir']

    # print args;exit()

    if len(infile) == 1:
        infile_list = re.split(r'\s+|;|:|,', infile[0])
    else:
        infile_list = infile

    for infile in infile_list:

        outfile = infile.replace('.xls', '.brief.xls')
        if outdir:
            outfile = os.path.join(outdir, os.path.basename(outfile))

        with safe_open(infile) as f, safe_open(outfile, 'w') as out:

            if add_header:
                header_file = os.path.join(BASE_DIR,
                                           'header/{}.header'.format(filetype))
                print 'add header:', header_file
                for line in get_added_header(header_file):
                    # print line
                    out.write(line)

            for line in f:
                linelist = line.rstrip('\n').split('\t')
                # if all(h in linelist for h in ['CHROM', 'POS']):
                if linelist[0] in ('Priority', 'Chr', 'CHROM'):
                    indices = list(get_indices(linelist, header_map[filetype]))
                    new_header = get_new_line(linelist, indices)
                    # print new_header
                    out.write(new_header)
                    continue
                new_line = get_new_line(linelist, indices)
                # print new_line
                out.write(new_line)

        print 'write brief file:', outfile
Example #9
def align():
    out_file = safe_open(out_path)  # Check out_path.

    if out_file is not None:
        print('Alinhando heads contra heads...', end=' ')
        # Remember you are using megablast.
        run(f"blastn -task 'megablast' -query '{heads_path}' -subject '{heads_path}'"
            f" -outfmt '6 {COLUMNS}' -out '{out_path}' -evalue 1e-10"
            f" -num_threads {n_cpu}", shell=True)
        print(f'Alinhamentos salvos em {out_path}.\n')
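
Examples #5, #9, #15, #16, #18 and #21 come from a codebase where `safe_open` is a no-clobber writer: it hands back a write handle for a new file, but skips or aborts when the output already exists. The real helper is not shown; one plausible sketch of that contract, inferred only from how it is called in these snippets (the default and error behaviours are assumptions), is:

import sys
from pathlib import Path


def safe_open(path, exist_ok=None):
    # Sketch only. Assumed contract:
    #   file absent             -> create parent dirs, return a write handle
    #   exists, default         -> print a notice and return None (caller skips)
    #   exists, exist_ok=False  -> raise, so a `with` block fails loudly
    #   exists, exist_ok='exit' -> stop the whole script
    path = Path(path)
    if path.exists():
        if exist_ok == 'exit':
            sys.exit(f"'{path}' already exists, aborting.")
        if exist_ok is False:
            raise FileExistsError(f"'{path}' already exists.")
        print(f"'{path}' already exists, skipping.")
        return None
    path.parent.mkdir(parents=True, exist_ok=True)
    return path.open('w')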
Example #10
    def get_samples(self):

        with utils.safe_open(self.__dict__['infile']) as f:
            for line in f:
                if line.startswith('#CHROM'):
                    linelist = line.strip().split('\t')
                    samplelist = linelist[linelist.index('FORMAT') + 1:]
                    break

        return ','.join(samplelist)
Example #11
    def save(self):
        lines = []
        lines.append("# %s" % self.header)
        lines.append("# %s\n" % datetime.now())

        for key in sorted(self.variables.keys()):
            value = self.variables[key]
            lines.append("%s=%s" % (key, value))

        with safe_open(self.path, "w") as f:
            f.write("\n".join(lines))
Example #12
    def write_ped_ws(self, pedfile, wsfile, pedlist):

        samples_with_data = []

        with utils.safe_open(pedfile, 'w') as pf, utils.safe_open(wsfile,
                                                                  'w') as wf:
            ws_count = []
            n = 0
            for ped in pedlist:
                sampleid = ped['sampleid']
                if self.sample_infos_all[sampleid]['data'] != '0':
                    samples_with_data.append(sampleid)
                    n += 1
                    ws_count.append(n)
                else:
                    ws_count.append(0)
                ped_text = '{familyid}\t{sampleid}\t{pa}\t{ma}\t{sex}\t{phenotype}\n'.format(
                    **ped)
                pf.write(ped_text)
            # print ws_count
            ws_text = ' '.join(map(str, ws_count)) + '\n'
            wf.write(ws_text)

        return samples_with_data
Example #13
    def samtools_call_hapmap(self, familyid, samples_with_data):

        vcf_list = '{analydir}/Advance/{newjob}/Linkage/{familyid}/vcf_{familyid}.list'.format(
            **dict(self.__dict__, **locals()))
        with utils.safe_open(vcf_list, 'w') as out:
            for sampleid in samples_with_data:
                out.write('{}.vcf\n'.format(sampleid))

        for sampleid in samples_with_data:
            print '>    samtools call hapmap for', sampleid

            cmd = '''
                set -eo pipefail
                echo samtools call hapmap for {sampleid} start: `date "+%F %T"`

                cd {analydir}/Advance/{newjob}/Linkage/{familyid}

                samtoolsv0.1.19 mpileup \\
                    -d 10000 -C 50 -D -S -m 2 -F 0.02 -q 13 -Q 13 \\
                    -gf {reffasta} \\
                    -l {moduledir}/Linkage/annotHapMap2L.txt \\
                    {analydir}/Mapping/{sampleid}.{sampleid}/{sampleid}.final.bam |
                bcftools_lh view \\
                    -cg -t 0.5 \\
                    -> {sampleid}.vcf

                echo samtools call hapmap for {sampleid} done: `date "+%F %T"`
            '''.format(**dict(self.__dict__, **locals()))

            shell_path = '{analydir}/Advance/{newjob}/Linkage/{familyid}/samtools_call_hapmap_{sampleid}.sh'.format(
                **dict(self.__dict__, **locals()))

            utils.write_shell(shell_path, cmd)

            # add job
            now_point = 'samtools_call_hapmap'
            job_name = 'samtools_call_hapmap_{sampleid}'.format(**locals())
            utils.add_job(self.jobs, now_point, self.args['startpoint'],
                          self.ANALYSIS_POINTS, job_name, shell_path,
                          self.queues)

            # add order
            before_jobs = ['final_bam_{sampleid}'.format(**locals())]
            after_jobs = ['linkdatagen_{familyid}'.format(**locals())]
            utils.add_order(self.orders,
                            job_name,
                            before_jobs=before_jobs,
                            after_jobs=after_jobs)
Example #14
def text2excel(outfile, *infiles):
    '''
    infiles can be a string: 'a.xls,b.xls'          ('a.xls,b.xls', )
                  or a list: ['a.xls', 'b.xls']     (['a.xls', 'b.xls'], )
    or many positional args: 'a.xls', 'b.xls'       ('a.xls', 'b.xls')
    '''

    # wb = openpyxl.Workbook(encoding='utf8')
    wb = openpyxl.Workbook()

    if len(infiles) == 1:
        if isinstance(infiles[0], str):
            infile_list = infiles[0].split(',')
        elif isinstance(infiles[0], list):
            infile_list = infiles[0]

    elif len(infiles) >= 2:
        infile_list = list(infiles)

    else:
        exit('error infiles: {}'.format(infiles))

    for infile in infile_list:
        sheetname = get_sheetname(infile)
        print 'create sheet:', sheetname
        sheet = wb.create_sheet(title=sheetname)
        # with codecs.open(infile, mode='r', encoding='gbk', errors='ignore') as f:
        with safe_open(infile) as f:
            for n, line in enumerate(f):
                row = n + 1
                linelist = line.strip().split('\t')
                for m, value in enumerate(linelist):
                    column = m + 1
                    sheet.cell(row=row, column=column, value=value)

    # remove default sheet
    try:
        wb.remove(wb['Sheet'])
    except AttributeError:
        wb.remove_sheet(wb.get_sheet_by_name('Sheet'))  # for the old version

    outdir = os.path.dirname(outfile)
    if outdir:
        mkdir_if_not_exists(outdir)

    wb.save(filename=outfile)

    print 'write excel file:', outfile
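
As the docstring notes, the input files can be passed in any of three equivalent ways; with made-up file names:

# equivalent call forms (file names are placeholders)
text2excel('out.xlsx', 'a.xls,b.xls')
text2excel('out.xlsx', ['a.xls', 'b.xls'])
text2excel('out.xlsx', 'a.xls', 'b.xls')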
Example #15
def main():
    for query in QUERIES:

        query_path = pardir / f'seqs/{query}.fa'
        out_path = pardir / f'alinhamentos/{query}_vs_genome.bl'
        out_file = safe_open(out_path)

        if out_file is None:
            continue

        print(f'Procurando alinhamentos de {query} contra genoma...')
        run((
            f"blastn -task blastn -query {str(query_path)} -db {str(genomedb_path)} "
            f"-outfmt '6 {' '.join(BL_COLUMNS)}' -out {str(out_path)} "
            f"-evalue 1e-10 -num_threads {n_cpu}"),
            shell=True)
        print(f'Alinhamentos salvos em {str(out_path)}.\n')
Example #16
def main():
    with u.safe_open(outpath, exist_ok=False) as outfile:
        raw_annotations = read_csv(raw_annotations_path,
                                   sep='\t',
                                   comment='#',
                                   header=None,
                                   names=GFF3_COLUMNS)

        print('Leitura encerrada. Removendo anotações não-gênicas...')

        genes_gff = raw_annotations.loc[raw_annotations['type'] == 'gene']
        genes_gff.loc[:, ['start', 'end']] = genes_gff[['start',
                                                        'end']].astype(int)

        lengths = genes_gff.end - genes_gff.start
        genes_gff.loc[:, 'attributes'] = genes_gff.attributes.str.replace(
            'ID=gene:', 'gene_id=')
        genes_gff['attributes'] = genes_gff.attributes.str.extract(
            r'(gene_id.*Name[^;]+)')
        # doesn't work with .loc (?!):
        # genes_gff.loc[:, 'attributes'] = genes_gff.attributes.str.extract(r'(gene_id.*Name[^;]+)')
        genes_gff.loc[:, 'attributes'] += ';length=' + lengths.astype(str)

        # ###### REMOVE GENES WITH COINCIDING START OR END
        genes_gff = genes_gff.loc[
            lengths.sort_values().index]  # sort by length
        # Keep the longest gene among those that coincide.
        genes_gff = genes_gff.drop_duplicates(['seqid', 'start'], keep='last')
        genes_gff = genes_gff.drop_duplicates(['seqid', 'end'], keep='last')

        if genes_gff.duplicated([
                'seqid', 'start'
        ]).sum() or genes_gff.duplicated(['seqid', 'end']).sum():
            print('ERRO: HÁ GENES COM INÍCIO/TÉRMINO DUPLICADOS:')
            print(genes_gff[genes_gff.duplicated(['seqid', 'start'],
                                                 keep=False)])
            print(genes_gff[genes_gff.duplicated(['seqid', 'end'],
                                                 keep=False)])
            raise ValueError

        genes_gff = genes_gff.sort_values(['seqid', 'start'])
        genes_gff.to_csv(outfile, sep='\t', index=False, header=None)

        print(f"Anotações gênicas mantidas em '{str(outpath)}'.")
Example #17
    def get_tran_relation(self):
        '''
        input:
            self.transcript: gene transcript
        output:
            list: [gene=trans, gene=trans]
        '''
        tran_relation = []
        with utils.safe_open(self.transript_database, 'r') as fr:
            for line in fr:
                if line.startswith('#'):
                    head_index = utils.get_head_index(line)
                    continue
                linelist = line.strip().split('\t')
                gene = linelist[head_index['#gene']]
                tran = linelist[head_index['transcript']]
                tran_relation.append('{gene}={tran}'.format(**locals()))

        return tran_relation
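
Examples #7 and #17 both pass a header line to `utils.get_head_index` and then look fields up by lower-cased column name ('#chrom', 'pos', '#gene', ...). The helper itself is not part of these snippets; a minimal sketch consistent with that usage (the lower-casing and tab-splitting are assumptions) would be:

def get_head_index(header_line):
    # Map lower-cased column names of a tab-separated header line to their indexes.
    return {name.lower(): i
            for i, name in enumerate(header_line.strip().split('\t'))}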
Example #18
def main():
    for kind, pattern in (('head', r'head\d+'), ('gene', r'Smp_\d+')):
        out_path = pardir/f'genome_annotation/{kind}_complement_annotations.gff3'
        outfile = safe_open(out_path, exist_ok=False)
        gff = pd.read_table(pardir/f'genome_annotation/{kind}_annotations.gff3', names=GFF3_COLUMNS)

        print(gff.strand.head())
        gff.loc[gff.strand == '+', 'strand'] = 'plus'
        gff.loc[gff.strand == '-', 'strand'] = '+'
        gff.loc[gff.strand == 'plus', 'strand'] = '-'
        print(gff.strand.head())

        gff['attributes'] = gff.attributes.str.replace(
            pattern,
            lambda match: match.group(0) + '_complement',
            regex=True)

        gff.to_csv(outfile, sep='\t', header=False, index=False)
        print(f"Wrote to '{str(out_path)}'.")
        outfile.close()
Example #19
    def make_readme(self):

        self.django_configure()

        title = open(self.args['pn']).read().strip()
        encoding = chardet.detect(title)['encoding']
        if encoding != 'utf8':
            title = title.decode(encoding)
        self.context['title'] = title

        # self.context['software'] = self.softwares

        src = os.path.join(RESULT_DIR, 'src')
        dest = '{Readme}'.format(**self.__dict__)
        self.link_data(src, dest)

        max_code = max(map(float, self.analy_list))
        if max_code < 2:
            report_type = 'qc'
        elif 2 <= max_code < 3:
            report_type = 'mapping'
        elif 3 <= max_code < 6.2:
            report_type = 'primary'
        elif max_code >= 6.2:
            report_type = 'advance'
        # print report_type
        self.context['report_type'] = report_type

        print json.dumps(self.context, ensure_ascii=False, indent=2)
        # print os.path.join(RESULT_DIR, 'templates')
        # template = loader.get_template('test.html')
        # template = loader.get_template('readme_template_chs.html')
        template = loader.get_template('index.html')
        if self.django_old:
            html = template.render(Context(self.context))
        else:
            html = template.render(self.context)
        # print html
        dest_html = os.path.join(dest, 'index.html')
        with utils.safe_open(dest_html, 'w') as out:
            out.write(html)
Example #20
    def conifer_call(self, sampleIDs):

        if 'V5' in self.args['TR']:
            probe = 'V5'
        elif 'V6' in self.args['TR']:
            probe = 'V6'
        else:
            print '[Error] Only agilent V5 or V6 can do CoNIFER analysis. '
            exit(1)

        # prepare data for conifer
        outfile = '{analydir}/SV/CoNIFER_{newjob}/sample_for_cnv_call'.format(
            **self.args)
        with utils.safe_open(outfile, 'w') as out:
            for sampleID in sampleIDs:
                bam = '{analydir}/Mapping/{sampleID}.{sampleID}/{sampleID}.final.bam'.format(
                    sampleID=sampleID, analydir=self.analydir)
                out.write('{}\t{}\n'.format(sampleID, bam))

        REF = 'hg19' if self.__dict__['ref'] == 'b37' else self.__dict__['ref']

        cmd = '''
            set -eo pipefail
            echo cnv call with conifer start: `date "+%F %T"`\n
            cd {analydir}/SV/CoNIFER_{newjob}

            python {moduledir}/Varition/CNV/CoNIFER/conifer_v0.2.2/conifer.pipe4.7.py \\
                --svd 10 \\
                --probe {probe} \\
                --ref {ref} \\
                --in sample_for_cnv_call \\
                --suffix {newjob} \\
                --out {analydir}/SV

            while read s b;do
                python {moduledir}/Varition/CNV/CoNIFER/conifer_v0.2.2/cnv_chrom_plot.py \\
                    {analydir}/SV/$s/conifer/$s.conifer.{REF}_multianno.xls \\
                    {ref} \\
                    {samp_info}
            done < sample_for_cnv_call

            rm -f *.hdf5

            echo cnv call with conifer done: `date "+%F %T"`
            '''.format(**dict(self.__dict__, **locals()))

        shell_path = '{analydir}/SV/CoNIFER_{newjob}/conifer_call.sh'.format(
            **self.args)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'conifer_call'
        job_name = 'conifer_call'
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = [
            'final_bam_{sampleID}'.format(sampleID=sampleID)
            for sampleID in sampleIDs
        ]
        after_jobs = ['primary_report']
        utils.add_order(self.orders,
                        job_name,
                        before_jobs=before_jobs,
                        after_jobs=after_jobs)
Example #21
def main():
    filtered_outfile = safe_open(filtered_outpath, exist_ok=False)
    discarded_outfile = safe_open(discarded_outpath, exist_ok=False)
    n_cpu = mp.cpu_count()

    #================== READ AND FILTER ALIGNMENTS ==================#

    print('Lendo resultados do Blast...', end=' ')
    perere3_vs_genoma = pd.read_table(perere3_inpath,
                                      header=None,
                                      names=BL_COLUMNS)
    sr3_vs_genoma = pd.read_table(sr3_inpath, header=None, names=BL_COLUMNS)
    print('Resultados lidos.')

    # Sort positions
    # for data in (perere3_vs_genoma, sr3_vs_genoma):
    #     data.sort_values('sstart', inplace=True)
    #     data.reset_index(drop=True, inplace=True)

    print('Buscando alinhamentos em que o SR3 é melhor...')
    discarded = pd.DataFrame()
    filtered_perere3_vs_genoma = perere3_vs_genoma.copy()

    p_groups = perere3_vs_genoma.groupby('saccver')
    s_groups = sr3_vs_genoma.groupby(
        'saccver')  ## does grouping change the index???????????

    print(
        'Iterando para cada scaffold no genoma e para cada perere3 no scaffold.'
    )
    for p_group_name in tqdm(p_groups.groups, desc='Scaffolds'):
        s_group = s_groups.get_group(p_group_name)
        p_group = p_groups.get_group(p_group_name)

        prinf('Combinando DataFrames...', end='\r')
        product = cartesian_product(
            p_group[['sstart', 'send', 'bitscore']].reset_index(),
            s_group[['sstart', 'send', 'bitscore']].reset_index())

        # discard when sr3 aligns better
        prinf('Filtrando por bitscore do SR3...', end='\r')
        product = product.loc[product.bitscore_x < product.bitscore_y]

        if product.empty:
            continue

        prinf('Subdividindo produto...         ', end='\r')
        product_chunks = np.array_split(product, n_cpu)

        prinf('Procurando sobreposições...', end='\r')
        with mp.Pool() as pool:
            chunks_discarded = pool.starmap(parse_product,
                                            enumerate(product_chunks))

        group_discarded = pd.concat(chunks_discarded)
        discarded = discarded.append(group_discarded)
        # print(discarded[~discarded['index'].isin(filtered_perere3_vs_genoma.index)])
        # print(filtered_perere3_vs_genoma.loc[discarded['index'].unique()])

    print(
        f"Escrevendo posições das linhas removidas de '{str(perere3_inpath)}' em '{str(discarded_outpath)}'...",
        end=' ')
    discarded.columns = pd.MultiIndex.from_product([('perere3', 'sr3'),
                                                    ('index', 'sstart',
                                                     'ssend', 'bitscore')])
    discarded.to_csv(discarded_outfile, sep='\t', index=False)
    print('Arquivo escrito.')

    print('Filtrando...', end=' ')
    filtered_perere3_vs_genoma.drop(discarded[('perere3', 'index')],
                                    inplace=True)
    print(f'\nFiltragem concluída. {len(discarded)} alinhamentos removidos.')

    print(
        f"Escrevendo alinhamentos filtrados do perere3 em '{str(filtered_outpath)}'...",
        end=' ')
    filtered_perere3_vs_genoma.to_csv(filtered_outfile, sep='\t', index=False)
    print('Arquivo escrito.')

    return filtered_perere3_vs_genoma, discarded
Example #22
import pandas as pd
from matplotlib import pyplot as plt
from sys import argv

if '--sem-sentido' in argv:
    nosense_flag = '_unconsidering_sense'

else:
    print(
        'Estamos considerando sentido por default (--sem-sentido para não considerar).'
    )
    nosense_flag = ''

outpath = pardir / f'genome_annotation/head_genes_correlations{nosense_flag}.tsv'
out_aggregated_counts = pardir / f'counted_reads/aggregated{nosense_flag}.tsv'
outfile = safe_open(outpath)

print('Buscando comprimentos de genes e heads...')
gene_attibutes = read_tsv(pardir / 'genome_annotation/gene_annotations.gff3',
                          names=GFF3_COLUMNS,
                          usecols=['attributes'])['attributes']
head_attibutes = read_tsv(pardir / 'genome_annotation/head_annotations.gff3',
                          names=GFF3_COLUMNS,
                          usecols=['attributes'])['attributes']
gene_lengths = parse_gff_attributes(gene_attibutes, gene_id='Name')['length']
head_lengths = parse_gff_attributes(head_attibutes)['length']
lengths = pd.concat([head_lengths, gene_lengths]).astype(int)

print('Concluído. Lendo arquivo de relações...')
relations = read_tsv(
    pardir / f'genome_annotation/head_genes_relations{nosense_flag}.tsv')
Example #23
GFF_COLS_SUBSET = ['seqid', 'start', 'end', 'strand', 'attributes']

# seqid is the name of the chromosome (contig)
if '--com-sentido' in argv:
    COLS_TO_GROUP = ['seqid', 'strand']
    nosense_flag = ''

else:
    print('Não estamos considerando sentido por default. '
          'Use --com-sentido para considerar, ou seja, relacionar '
          'apenas quando cópia e gene estiverem na mesma fita.')
    COLS_TO_GROUP = 'seqid'
    nosense_flag = '_unconsidering_sense'

outpath = pardir / f'genome_annotation/head_genes_relations{nosense_flag}_multiprocessed.tsv'
outfile = safe_open(outpath)
n_cpu = mp.cpu_count()


def parse_head_row(head_row, gene_group, outfile):
    parse_head_row.last_args = head_row, gene_group, outfile

    for _, gene_row in gene_group.iterrows():
        if overlaps((gene_row.start, gene_row.end),
                    (head_row.start, head_row.end)):
            flag = 'olap'
            chosen_gene_id = gene_row.id
            distance = 0
            break

    # if none overlaps
Example #24
output_root = '../hyperparameter_tuning/{}/{}/{}/perturbs_{}_sigma{}_temp{}_dweight{}_lr{}'.format(
    distance_function, data_name, model_type, opt, sigma_val, temperature_val,
    distance_weight_val, lr)

num_iter = 10
sigma = np.full(n_examples, sigma_val)
temperature = np.full(n_examples, temperature_val)
distance_weight = np.full(n_examples, distance_weight_val)
to_optimize = [perturbed]
indicator = np.ones(n_examples)
best_perturb = np.zeros(perturbed.shape)
best_distance = np.full(n_examples,
                        1000.)  # all distances should be below 1000
perturb_iteration_found = np.full(n_examples, 1000 * num_iter, dtype=np.int64)
average_distance = np.zeros(num_iter)
with utils.safe_open(output_root + '.txt', 'w') as fout:
    fout.write(
        '{} {} {} --sigma={} --temp={} --distance_weight={} --lr={}\n'.format(
            model_name, opt, distance_function, sigma_val, temperature_val,
            distance_weight_val, lr))
    for i in range(num_iter):
        with tf.GradientTape(persistent=True) as t:
            p_model = utils.filter_hinge_loss(n_class, indicator, perturbed,
                                              sigma, temperature,
                                              prob_from_input)
            approx_prob = tf.gather_nd(p_model, example_class_index)

            if distance_function == 'euclidean':
                distance = utils.safe_euclidean(perturbed - feat_input, axis=1)
            elif distance_function == 'cosine':
                distance = utils.safe_cosine(perturbed, feat_input)
Example #25
    def sv_info(self):

        sv_context = defaultdict(list)
        svid_list = []

        with utils.safe_open(self.infile) as f:
            print 'open file: {}'.format(self.infile)
            for line in f:
                linelist = line.strip().split('\t')
                if linelist[0] in ('Chr', ):
                    headerlist = linelist
                    continue

                chrom = linelist[headerlist.index('Chr')].strip('Chr').strip(
                    'chr')
                start = linelist[headerlist.index('Start')]
                end = linelist[headerlist.index('End')]
                func = linelist[headerlist.index('Func')]
                tchr = linelist[headerlist.index('TCHR')].strip('Chr').strip(
                    'chr')
                tstart = linelist[headerlist.index('TSTART')]
                svid = linelist[headerlist.index('SVID')]
                svtype = linelist[headerlist.index('SVType')]

                # For the same SVID, keep only the first record
                if svid in svid_list:
                    continue

                # Keep only autosomes plus X/Y
                if any(each not in self.normal_chrom + ['na']
                       for each in [chrom, tchr]):
                    # sys.stderr.write('skip a line of unnormal chrom: ' + line)
                    continue

                # Keep only variants in exonic or splicing regions
                if not (func.startswith('exonic')
                        or func.startswith('splicing')):
                    # sys.stderr.write('skip a line of not exonic or splicing: ' + line)
                    continue

                # Skip breakpoint lines
                if svtype == 'breakpoint':
                    continue

                # for lumpy
                if svtype == 'DUP':
                    svtype = 'INS'

                # for breakdancer
                if svtype not in ('CTX', 'ITX', 'DEL', 'INS', 'INV'):
                    svtype = linelist[headerlist.index('TX')][:3].upper()

                end1 = int(start) + 1
                if svtype in ('CTX', 'ITX'):
                    chrom2 = tchr
                    start2 = tstart
                    end2 = int(start2) + 1
                elif svtype in ('DEL', 'INS', 'INV'):
                    chrom2 = chrom
                    start2 = end
                    end2 = int(start2) + 1

                info = 'hs{chrom} {start} {end1} hs{chrom2} {start2} {end2}'.format(
                    **locals())

                svid_list.append(svid)
                sv_context[svtype].append(info)

        for svtype in ('CTX', 'ITX', 'DEL', 'INS', 'INV'):
            outfile = '{outdir}/{vtype}_{svtype}_{outsuffix}'.format(
                **dict(self.__dict__, **locals()))
            with utils.safe_open(outfile, 'w') as out:
                for info in sv_context[svtype]:
                    out.write(info + '\n')
            print 'write file: {}'.format(outfile)
Example #26
    def mutation_info(self):

        chrom_region = self._get_chrom_region()

        # print chrom_region['1'].items()[0]

        with utils.safe_open(self.infile) as f:
            print 'open file: {}'.format(self.infile)
            for line in f:
                linelist = line.strip().split('\t')
                if linelist[0] == '#CHROM':
                    headerlist = linelist
                if line.startswith('#'):
                    continue
                chrom = linelist[headerlist.index('#CHROM')]
                pos = int(linelist[headerlist.index('POS')])

                now_region = self._get_now_region(chrom, pos, chrom_region)
                # print chrom, now_region

                if chrom not in self.normal_chrom:
                    continue

                if self.vtype == 'snp':
                    genotype = self._get_genotype(headerlist, linelist)
                    if genotype == 'hom':
                        chrom_region[chrom][now_region][0] += 1
                    else:
                        chrom_region[chrom][now_region][1] += 1
                elif self.vtype == 'indel':
                    chrom_region[chrom][now_region][0] += 1

        density_outfile = '{outdir}/{vtype}_density_{outsuffix}'.format(
            **self.__dict__)
        if self.vtype == 'snp':
            snp_hom_het_ratio_outfile = '{outdir}/{vtype}_ratio_{outsuffix}'.format(
                **self.__dict__)
            snp_hom_het_ratio_out = utils.safe_open(snp_hom_het_ratio_outfile,
                                                    'w')

        chrom_order = map(str, range(1, 23)) + ['X', 'Y']
        with utils.safe_open(density_outfile, 'w') as density_out:
            for chrom, regions in sorted(chrom_region.iteritems(),
                                         key=lambda
                                         (k, v): chrom_order.index(k)):
                for start, end in sorted(regions):
                    site_number = sum(regions[start, end])
                    density = float(site_number) / self.region_length
                    line = 'hs{chrom}\t{start}\t{end}\t{density}\n'.format(
                        **locals())
                    density_out.write(line)

                    if self.vtype == 'snp':
                        hom_ratio = het_ratio = 0
                        if site_number:
                            hom_ratio = regions[start,
                                                end][0] / float(site_number)
                            het_ratio = regions[start,
                                                end][1] / float(site_number)
                        line = 'hs{chrom}\t{start}\t{end}\t{hom_ratio},{het_ratio}\n'.format(
                            **locals())
                        snp_hom_het_ratio_out.write(line)

        print 'write file: {}'.format(density_outfile)

        if self.vtype == 'snp':
            snp_hom_het_ratio_out.close()
            print 'write file: {}'.format(snp_hom_het_ratio_outfile)
Example #27
def get_added_header(header_file):

    with safe_open(header_file) as h:
        for line in h:
            yield line
Example #28
    def process_iobj(self, iobj):
        """
        Processing
        :param iobj: 
        :return: 
        """
        input_name = self.iobj_name(iobj)
        logger.info('Processing: %s' % input_name)

        finish_file = self.get_finish_file(input_name)
        if os.path.exists(finish_file):
            logger.info('Finish indicator file exists, skipping: %s' %
                        finish_file)
            return

        self.cur_decompressor = None
        self.cur_state_file = self.get_state_file(input_name)
        file_leafs = self.get_classification_leafs(input_name)
        file_roots = self.get_classification_roots(input_name)
        self.last_record_resumed = None

        self.processor = newline_reader.NewlineReader(is_json=False)
        handle = iobj
        name = str(iobj)

        if name.endswith('lz4'):
            self.cur_decompressor = lz4framed.Decompressor(handle)
            handle = self.cur_decompressor

        if not self.is_dry() and (not self.args.continue1
                                  or not os.path.exists(file_leafs)
                                  or not os.path.exists(file_roots)):
            utils.safely_remove(file_leafs)
            utils.safely_remove(file_roots)
            self.file_leafs_fh = utils.safe_open(file_leafs,
                                                 mode='w',
                                                 chmod=0o644)
            self.file_roots_fh = utils.safe_open(file_roots,
                                                 mode='w',
                                                 chmod=0o644)

        elif self.args.continue1:
            logger.info('Continuing with the started files')
            self.file_leafs_fh = open(file_leafs,
                                      mode='r+' if not self.is_dry() else 'r')
            self.file_roots_fh = open(file_roots,
                                      mode='r+' if not self.is_dry() else 'r')
            self.restore_checkpoint(iobj)
            self.continue_leafs(file_leafs)

        with iobj:
            resume_token_found = False
            resume_token = None
            resume_idx = 0
            record_ctr = -1
            already_processed = 0
            read_start = self.read_data
            for idx, record in self.processor.process(handle):
                try:
                    record_ctr += 1
                    self.read_data += len(record)

                    # Check the checkpoint distance + boundary - process all newline chunks available
                    if self.read_data - self.last_report >= 1024 * 1024 * 1024 and self.processor.step_cur_last_element:
                        logger.info(
                            '...progress: %s GB, idx: %s, pos: %s GB, '
                            'found: %s, mem: %04.8f MB, readpos: %s (%4.6f GB)'
                            % (self.read_data / 1024.0 / 1024.0 / 1024.0, idx,
                               self.read_data, self.num_found,
                               utils.get_mem_usage() / 1024.0, iobj.tell(),
                               iobj.tell() / 1024.0 / 1024.0 / 1024.0))

                        self.last_report = self.read_data
                        self.try_store_checkpoint(iobj=iobj,
                                                  idx=idx,
                                                  resume_idx=resume_idx,
                                                  resume_token=resume_token)

                        # Flush already seen IP database, not needed anymore
                        # we are too far from the resumed checkpoint
                        if read_start + 1024 * 1024 * 1024 * 2 > self.read_data:
                            self.state_loaded_ips = set()

                    js = json.loads(record)
                    self.process_record(idx, js)

                except Exception as e:
                    logger.error('Exception in processing %d: %s' %
                                 (self.ctr, e))
                    logger.debug(traceback.format_exc())
                    logger.debug(record)

                self.ctr += 1

            logger.info('Total: %d' % self.ctr)
            logger.info('Total_chain: %d' % self.chain_ctr)
            logger.info('Not tls: %d' % self.not_tls)
            logger.info('Not cert ok: %d' % self.not_cert_ok)
            logger.info('Not chain ok: %d' % self.not_chain_ok)
            logger.info('Not parsed: %d' % self.not_parsed)
            logger.info('Not rsa: %d' % self.not_rsa)

        logger.info('Processed: %s' % iobj)
        if not self.is_dry():
            self.file_leafs_fh.close()
            self.file_roots_fh.close()
            utils.try_touch(finish_file)
Example #29
def handle_client(conn,addr):
    utils.write_output_formatted(MODE,f"[NEW CONNECTION] {addr} connected.",SERVER_OUTPUT_DIR_LOG)
    connected=True
    while connected:
        pre_msg_header=conn.recv(HEADER)
        msg_header=pre_msg_header.decode(FORMAT)
        conn.send(CONFIRMATION_MSG.encode(FORMAT))

        if msg_header.strip():
            header_elems=msg_header.split('-')
            msg_cat=header_elems[0].strip()
            msg_type=header_elems[1].strip()
            
            if msg_cat=="SEND":
                msg_size=int(header_elems[2].strip())
                if msg_type=="TEXT":
                    msg=conn.recv(msg_size).decode(FORMAT)
                    conn.send(CONFIRMATION_MSG.encode(FORMAT))

                    utils.write_output_formatted(MODE,"Received text message: {}".format(msg),SERVER_OUTPUT_DIR_LOG)

                    if msg==DISCONNECT_MSG:
                        connected=False

                elif msg_type=="FILE":
                    data=utils.receive_chunks(conn,msg_size)
                    conn.send(CONFIRMATION_MSG.encode(FORMAT))

                    filename=msg_header.split('-')[3].strip()
                    with utils.safe_open(f"./{filename}",'wb') as f:
                        f.write(data)
                        
                    utils.write_output_formatted(MODE,"Received file {}".format(filename),SERVER_OUTPUT_DIR_LOG)

            elif msg_cat=="REQUEST":
                if msg_type=="LOG":
                    logpath=utils.get_latest_log()

                    if logpath:
                        msg="True"
                        msg_send=b' '*(HEADER-len(msg))+msg.encode(FORMAT)
                        conn.send(msg_send)

                        with open(logpath,'rb') as f:
                            logdata=f.read()

                        logdata_size=str(len(logdata)).encode(FORMAT)
                        logdata_size += b' ' * (HEADER-len(logdata_size))
                        conn.send(logdata_size)
                        utils.send_chunks(conn,logdata)
                        utils.write_output_formatted(MODE,"Sent log file {}".format(logpath),SERVER_OUTPUT_DIR_LOG)
                    else:
                        msg="False"
                        msg_send=b' '*(HEADER-len(msg))+msg.encode(FORMAT)
                        conn.send(msg_send)

                elif msg_type=="PLOT":
                    ticker=header_elems[2].strip()
                    plot=utils.get_plot(ticker)

                    if plot:
                        msg="True"
                        msg_send=b' '*(HEADER-len(msg))+msg.encode(FORMAT)
                        conn.send(msg_send)
                        with open(plot,'rb') as f:
                            plotdata=f.read()

                        plotdata_size=str(len(plotdata)).encode(FORMAT)
                        plotdata_size += b' ' * (HEADER-len(plotdata_size))
                        conn.send(plotdata_size)
                        utils.send_chunks(conn,plotdata)
                        utils.write_output_formatted(MODE,"Sent plot {}".format(utils.get_plot(ticker)),SERVER_OUTPUT_DIR_LOG)            
                    else:
                        msg="False"
                        msg_send=b' '*(HEADER-len(msg))+msg.encode(FORMAT)
                        conn.send(msg_send)


    utils.write_output_formatted(MODE,f"Closing connection with {addr}.",SERVER_OUTPUT_DIR_LOG)      
    conn.close()
Example #30
    def read(self):
        with safe_open(self.path, "r") as f:
            return parse_env_file(f.read())
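
This `read` method is the counterpart of the `save` method in Example #11, which writes two `#` comment lines followed by `KEY=value` pairs. `parse_env_file` is not included in these snippets; a plausible sketch for that format (an assumption, not the real implementation) is:

def parse_env_file(text):
    # Parse '# comment' and 'KEY=value' lines into a dict (sketch only).
    variables = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        key, _, value = line.partition('=')
        variables[key.strip()] = value.strip()
    return variables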