Esempi in Python per Pipeline.load, esempi in Python per CGATCore.Pipeline.load

Esempio n. 1

0

Mostra file

def loadQcMeasures(infile, outfile):
    '''Load the QC measures table into the CSVDB database.

    ``infile`` is handed to ``P.load`` unchanged; the loader is asked to
    create an index on the ``track`` column.
    '''
    index_option = "--add-index=track"
    P.load(infile, outfile, options=index_option)

Esempio n. 2

0

Mostra file

File: pipeline_bamstats.py Progetto: logust79/cgat-flow

def loadExonValidation(infiles, outfile):
    '''Load merged and per-sample exon validation statistics.

    All files in `infiles` are merged and loaded in one go through
    ``P.merge_and_load``.  Afterwards, for every input file the companion
    ``<infile>.overrun.gz`` file is loaded separately, using
    ``<track>_overrun.load`` as its log file, where ``<track>`` is the
    input filename with ``.exon.validation.tsv.gz`` snipped off.

    Parameters
    ----------
    infiles : list
       Input filenames with exon validation stats
    outfile : str
       Output filename of the merged load
    '''
    suffix = ".exon.validation.tsv.gz"

    P.merge_and_load(infiles, outfile, suffix=suffix)

    for stats_file in infiles:
        overrun_log = "%s_overrun.load" % P.snip(stats_file, suffix)
        P.load(stats_file + ".overrun.gz", overrun_log)

Esempio n. 3

0

Mostra file

def loadNCG(outfile):
    '''Load the NCG table into the database.

    The input path is read from the ``cancergenes_table`` entry of
    ``PARAMS``; the loaded table is indexed on ``symbol``.
    '''
    cancer_genes_path = PARAMS["cancergenes_table"]
    P.load(cancer_genes_path, outfile, options="--add-index=symbol")

Esempio n. 4

0

Mostra file

File: PipelineTranscriptDiffExpression.py Progetto: logust79/cgat-flow

def loadSleuthTableGenes(infile, outfile, gene_info, gene_biotypes, database,
                         annotations_database):
    '''Annotate a sleuth gene table with gene names and load it.

    The tab-separated `infile` is indexed on its ``test_id`` column and
    joined (index against index) with the ``gene_id``/``gene_name`` pairs
    selected from the annotations table.  The joined table is written to a
    temporary file, loaded with an index on ``gene_id``, and the temporary
    file is removed again - also when any of the steps fails.

    Parameters
    ----------
    infile : str
        Tab-separated table containing a ``test_id`` column.
    outfile : str
        Passed to ``P.load`` as its output file.
    gene_info : str
        Path whose basename is used as the table name inside the
        ``annotations`` schema.
    gene_biotypes : str
        Comma-separated biotypes used to build a ``WHERE`` clause; an
        empty value selects all rows.
    database, annotations_database
        Passed to ``connect`` to obtain the database handle.
    '''
    table = os.path.basename(gene_info)

    if gene_biotypes:
        # NOTE(review): the clause is built by string formatting, so
        # gene_biotypes must come from trusted configuration.
        where_cmd = "WHERE " + " OR ".join(
            "gene_biotype = '%s'" % biotype
            for biotype in gene_biotypes.split(","))
    else:
        where_cmd = ""

    select = """SELECT DISTINCT
        gene_id, gene_name
        FROM annotations.%(table)s
        %(where_cmd)s""" % {"table": table, "where_cmd": where_cmd}

    # NOTE(review): the scratch directory is hard-coded; kept as-is.
    tmpfile = P.getTempFilename("/ifs/scratch/")
    try:
        df1 = pd.read_table(infile, sep="\t")
        df1.set_index("test_id", drop=False, inplace=True)

        df2 = pd.read_sql(select, connect(database, annotations_database))
        df2.set_index("gene_id", drop=False, inplace=True)

        df = df1.join(df2)
        df.to_csv(tmpfile, sep="\t", index=True)

        P.load(tmpfile, outfile, options="--add-index=gene_id")
    finally:
        # always clean up, but do not mask the original error if the
        # temporary file is already gone
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)

Esempio n. 5

0

Mostra file

def loadMutectExtendedOutput(infile, outfile):
    '''Load the mutect extended output into the database.

    The file that is loaded is derived from `infile` by replacing
    ``.mutect.snp.vcf`` with ``_call_stats.out``.  The table is indexed
    on ``contig,position``.
    '''
    call_stats_file = infile.replace(".mutect.snp.vcf", "_call_stats.out")
    index_columns = "contig,position"
    load_options = "--add-index=%s" % index_columns

    P.load(call_stats_file, outfile, options=load_options)

Esempio n. 6

0

Mostra file

File: PipelineBamStats.py Progetto: logust79/cgat-flow

def loadPicardCoverageStats(infiles, outfile):
    '''Import coverage statistics into the database.

    For every input file the comment lines (starting with ``#``) and blank
    lines are dropped.  Of the remaining lines, the first is taken as the
    column header and the second as the data row.  The header of the first
    file and one row per file are written, each prefixed with a ``track``
    column, to a temporary file which is then loaded.

    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track; the track name is the file's
        basename without the ``.cov`` suffix.
    outfile : string
        Logfile. Passed on to ``P.load``.

    Raises
    ------
    IndexError
        If a file holds fewer than two non-comment, non-blank lines.
    '''

    outf = P.get_temp_file(".")
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".cov")
        # the context manager closes the input file promptly
        with open(f, "r") as inf:
            lines = [x for x in inf
                     if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()

    try:
        P.load(outf.name,
               outfile,
               options="--ignore-empty --add-index=track")
    finally:
        # remove the temporary file even when loading fails
        os.unlink(outf.name)

Esempio n. 7

0

Mostra file

def loadSailfish(infile, outfile):
    '''Load the Sailfish gene counts table into CSVDB.

    Thin wrapper: `infile` and `outfile` are forwarded to ``P.load``
    without any extra loader options.
    '''
    P.load(infile=infile, outfile=outfile)

Esempio n. 8

0

Mostra file

def loadExonValidation(infiles, outfile):
    '''Merge alignment stats into a single table and load overrun files.

    All `infiles` are merged and loaded via ``mergeAndLoad``.  Then, for
    each input file, the companion ``<infile>.overrun.gz`` file is loaded
    with ``<track>_overrun.load`` as its log file, where ``<track>`` is
    the input filename without the ``.exon.validation.tsv.gz`` suffix.
    '''
    suffix = ".exon.validation.tsv.gz"

    # NOTE(review): mergeAndLoad is called unqualified, so it must be
    # defined or imported elsewhere in this module - confirm.
    mergeAndLoad(infiles, outfile, suffix=suffix)

    for infile in infiles:
        track = P.snip(infile, suffix)
        o = "%s_overrun.load" % track
        P.load(infile + ".overrun.gz", o)

Esempio n. 9

0

Mostra file

def loadAnnotations(infile, outfile):
    '''Load variant annotations into the database.

    The loader is told to map ``gene_id`` to ``str`` and
    ``base_qualities`` to ``text``, and to index ``gene_id``.
    '''
    load_options = ("--map=gene_id:str "
                    "--add-index=gene_id "
                    "--map=base_qualities:text ")

    P.load(infile, outfile, options=load_options)

Esempio n. 10

0

Mostra file

File: PipelineMappingQC.py Progetto: logust79/cgat-flow

def loadIdxstats(infiles, outfile):
    '''Merge samtools idxstats outputs and load them into the database.

    Each file is read as a headerless, tab-separated table with the
    columns ``region``, ``length``, ``mapped`` and ``unmapped``.  Three
    extra rows (``total_mapped_reads``, ``total_reads`` and ``track``) are
    appended, and the ``mapped`` column is turned into a single row that
    is indexed by track.  The rows of all files are concatenated, the
    ``*`` column is dropped, and the table is transposed before it is
    written to a temporary file and loaded.

    Arguments
    ---------
    infiles : list
        list where each element is a string of the filename containing
        samtools idxstats output. Filename format is expected to be
        'sample.idxstats'. Missing files are skipped with a warning.
    outfile : string
        Logfile. Passed on to ``P.load``.

    Raises
    ------
    ValueError
        From ``pandas.concat`` when none of the input files exists.
    '''

    dfs = []
    for f in infiles:
        track = P.snip(f, ".idxstats").split('/')[-1]

        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue

        # reformat idx stats
        df = pandas.read_csv(f, sep='\t', header=None)
        df.columns = ['region', 'length', 'mapped', 'unmapped']

        # calc total reads mapped & unmapped
        total_reads = df.unmapped.sum() + df.mapped.sum()
        total_mapped_reads = df.mapped.sum()

        reformatted_df = pandas.DataFrame(
            [['total_mapped_reads', total_mapped_reads],
             ['total_reads', total_reads], ['track', track]],
            columns=(['region', 'mapped']))

        # DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent call and works on older versions too
        df = pandas.concat([df, reformatted_df], ignore_index=True)
        df.set_index('region', inplace=True)
        df1 = df[['mapped']].T
        # set track as index
        df1.set_index('track', inplace=True)
        dfs.append(df1)

    # merge dataframes into single table
    master_df = pandas.concat(dfs)
    master_df.drop('*', axis=1, inplace=True)
    # transform dataframe to avoid reaching column limit
    master_df = master_df.T

    # the temporary file is only created once there is data to write and
    # is removed even if writing or loading fails
    outf = P.get_temp_file(".")
    try:
        try:
            master_df.to_csv(outf, sep='\t', index=True)
        finally:
            outf.close()

        P.load(outf.name, outfile,
               options="--ignore-empty --add-index=track")
    finally:
        os.unlink(outf.name)

Esempio n. 11

0

Mostra file

def loadVariantAnnotation(infile, outfile):
    '''Load VCF annotations into the database.

    Only files ending in ``indels.annotated.filtered.tsv`` or
    ``mutect.snp.annotated.filtered.tsv`` are supported; both are indexed
    on ``CHROM,POS,SNPEFF_GENE_NAME``.

    Raises
    ------
    ValueError
        If `infile` has neither of the supported suffixes.  Previously
        this surfaced as an opaque ``KeyError`` because the index columns
        were never assigned.
    '''
    supported_suffixes = ("indels.annotated.filtered.tsv",
                          "mutect.snp.annotated.filtered.tsv")

    if not infile.endswith(supported_suffixes):
        raise ValueError(
            "unsupported annotation file '%s': expected a name ending in "
            "one of %s" % (infile, ", ".join(supported_suffixes)))

    indices = "CHROM,POS,SNPEFF_GENE_NAME"

    P.load(infile, outfile, options="--add-index=%s" % indices)

Esempio n. 12

0

Mostra file

def loadCPCResults(infile, outfile):
    '''Load the results of the cpc analysis.

    The loader is given the column names ``transcript_id``, ``feature``,
    ``C_NC`` and ``CP_score`` and indexes ``transcript_id``.
    '''
    header_option = "--header-names=transcript_id,feature,C_NC,CP_score "
    index_option = "--add-index=transcript_id"

    P.load(infile, outfile, options=header_option + index_option)

Esempio n. 13

0

Mostra file

def loadPolyphenMap(infile, outfile):
    '''Load polyphen input data.

    The file that is loaded is ``<infile>.map``.  Indices are requested
    for ``snp_id``, ``track,transcript_id``, ``contig,pos``,
    ``protein_id`` and ``transcript_id``.
    '''
    map_file = infile + ".map"
    index_options = ("--add-index=snp_id "
                     "--add-index=track,transcript_id "
                     "--add-index=contig,pos "
                     "--add-index=protein_id "
                     "--add-index=transcript_id ")

    P.load(map_file, outfile, options=index_options)

Esempio n. 14

0

Mostra file

File: PipelineBamStats.py Progetto: logust79/cgat-flow

def loadTranscriptProfile(infiles, outfile,
                          suffix="transcript_profile",
                          tablename=None):
    '''Load transcript profiles into one table.

    For every entry of `infiles` the matrix file
    ``<infile>.geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz``
    is read.  Its ``none`` column is renamed to the track name (the
    basename of the entry without ``.transcriptprofile.gz``) and the
    ``area``, ``counts`` and ``background`` columns are dropped.  The
    tables are merged on ``bin``, ``region`` and ``region_bin``, written
    to a temporary file and loaded with an index on ``bin``.

    Arguments
    ---------
    infiles : list
        Filenames of files with matrix from bam2geneprofile. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Used as the table name when `tablename` is unset.
    tablename : string
        Tablename to use. If unset, `suffix` is used as the table name.

    Raises
    ------
    ValueError
        If `infiles` is empty (there would be nothing to load).
    '''

    if not infiles:
        raise ValueError("no transcript profile files supplied")

    if not tablename:
        tablename = "%s" % (suffix)

    table_join = None

    for infile in infiles:

        matrix_file = str(infile) + ".geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz"
        name = P.snip(os.path.basename(infile), ".transcriptprofile.gz")

        table = pd.read_csv(matrix_file, sep="\t")
        table.rename(columns={'none': name}, inplace=True)
        table.drop(["area", "counts", "background"], axis=1, inplace=True)

        if table_join is None:
            table_join = table
        else:
            # the newest table is the left side of the merge
            table_join = table.merge(table_join,
                                     on=["bin", "region", "region_bin"],
                                     how="left")

    # remove the temporary file even if writing or loading fails
    outf = P.get_temp_file(".")
    try:
        try:
            table_join.to_csv(outf, sep="\t", index=False)
        finally:
            outf.close()

        P.load(infile=outf.name,
               outfile=outfile,
               tablename=tablename,
               options="--add-index=bin")
    finally:
        os.unlink(outf.name)

Esempio n. 15

0

Mostra file

def loadBioProspector(infile, outfile):
    '''Load results from bioprospector.

    A logo image is built for every motif in `infile` and stored as
    ``<exportdir>/bioprospector/<track>_<NN>.png``.  Every match of every
    motif is written as one row (id, motif number, start, end, strand,
    arrangement) to a temporary file, which is then loaded.

    The arrangement is ``ER`` for opposite strands (``+-``/``-+``),
    ``DR`` for equal strands (``++``/``--``) and ``SM`` otherwise,
    followed by the distance between the two motif halves (``0`` for
    ``SM``).
    '''

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "bioprospector")

    # an existing directory is fine; other errors (e.g. permissions) are
    # no longer silently swallowed
    os.makedirs(target_path, exist_ok=True)

    track = infile[:-len(".bioprospector")]

    # the input stays open for the whole loop in case the parser reads
    # it lazily
    with IOTools.open_file(infile, "r") as inf:
        results = Bioprospector.parse(inf)

        tmpfile = P.get_temp_file()
        tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

        for x, motifs in enumerate(results):
            outname = os.path.join(target_path, "%s_%02i.png" % (track, x))
            Bioprospector.build_logo([y.sequence for y in motifs.matches],
                                     outname)

            for match in motifs.matches:

                distance = abs(match.start + match.width1 -
                               (match.end - match.width2))

                if match.strand in ("+-", "-+"):
                    arrangement = "ER"
                elif match.strand in ("++", "--"):
                    arrangement = "DR"
                else:
                    arrangement = "SM"
                    distance = 0

                arrangement += "%i" % distance
                strand = match.strand[0]

                # keep only the part of the id after the last underscore
                match_id = re.sub(".*_", "", match.id)
                tmpfile.write("%s\t%i\t%i\t%i\t%s\t%s\n" %
                              (match_id, x, match.start, match.end, strand,
                               arrangement))
        tmpfile.close()

    try:
        P.load(tmpfile.name,
               outfile,
               options="--add-index=id "
               "--add-index=motif "
               "--add-index=id,motif "
               "--allow-empty-file "
               "--map=base_qualities:text")
    finally:
        # remove the temporary file even when loading fails
        os.unlink(tmpfile.name)

Esempio n. 16

0

Mostra file

File: PipelineBamStats.py Progetto: logust79/cgat-flow

def loadCountReads(infiles, outfile,
                   suffix="nreads",
                   pipeline_suffix=".nreads",
                   tablename=None):
    '''Load read counts.

    Every line of every input file contributes one ``track<TAB>count``
    row, where the count is the second tab-separated field of the line.
    The rows are collected in a temporary file under a ``track``/``nreads``
    header and loaded with an index on ``track``.

    Arguments
    ---------
    infiles : list
        Filenames of files with number of reads per sample. Each file
        corresponds to a different track. Missing files are skipped with
        a warning.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``to_table(outfile) + "_" +
        suffix``.

    Raises
    ------
    IndexError
        If a line has no second tab-separated field.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)

    outf = P.get_temp_file(".")

    outf.write("%s\t%s\n" % ("track", "nreads"))

    for filename in infiles:
        track = P.snip(os.path.basename(filename), pipeline_suffix)

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        with IOTools.open_file(filename, "r") as inf:
            for line in inf:
                # strip the line terminator first: when the count is the
                # last field it would otherwise carry its own newline and
                # produce a blank line after every row
                count = line.rstrip("\r\n").split("\t")[1]
                outf.write("%s\t%s\n" % (track, count))

    outf.close()

    try:
        P.load(infile=outf.name,
               outfile=outfile,
               tablename=tablename,
               options="--add-index=track")
    finally:
        # remove the temporary file even when loading fails
        os.unlink(outf.name)

Esempio n. 17

0

Mostra file

def loadManualAnnotations(infile, outfile):
    '''Load a manual annotation file with an index on ``gene_id``.

    The annotation name is `infile` without its ``_annotations.tsv``
    suffix.  A temporary two-column file is written: the header is
    ``<annotation>`` and ``gene_id``, and every line of `infile` is
    copied with the annotation name prepended as first column.  The
    temporary file is loaded and then deleted.
    '''
    tmp = P.get_temp_filename(".")
    annotation = P.snip(infile, "_annotations.tsv")
    row_prefix = "%s\t" % annotation

    with IOTools.open_file(tmp, "w") as outf:
        outf.write(row_prefix + "gene_id\n")
        with IOTools.open_file(infile, "r") as inf:
            for gene_line in inf:
                outf.write(row_prefix + gene_line)

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)

Esempio n. 18

0

Mostra file

def mergeEffects(infiles, outfile):
    '''Load transcript effects into single tables.

    The gzipped ``*.effects.gz`` files are concatenated into
    ``effects.txt``: the header of the first existing file is kept and
    every data line is prefixed with its track (the file's basename
    without ``.effects.gz``).  The merged file is loaded with an index on
    ``transcript_id``.

    The same is repeated for the companion files
    ``<infile>.<suffix>.gz`` with suffix ``cds``, ``intron``,
    ``splicing``, ``translation`` and ``genes``; these are merged into
    ``effects.<suffix>.txt`` and loaded into the table
    ``<tablename>_<suffix>``, where ``<tablename>`` is derived from
    `outfile` via ``P.toTable``.

    Missing input files are skipped with a warning.
    '''

    tablename = P.toTable(outfile)

    merged_name = 'effects.txt'
    first = True
    with open(merged_name, 'w') as outf:
        for f in infiles:
            track = P.snip(os.path.basename(f), ".effects.gz")
            if not os.path.exists(f):
                E.warn("File %s missing" % f)
                continue
            # text mode: binary mode yields bytes, which would be written
            # as their repr ("b'...'") by the %s formatting below
            with gzip.open(f, "rt") as inf:
                lines = inf.readlines()
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False
            for line in lines[1:]:
                outf.write("%s\t%s" % (track, line))

    # load the file that was just written (the name used to be misspelt)
    P.load(merged_name, outfile, options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):

        merged_name = 'effects.' + suffix + '.txt'
        first = True
        with open(merged_name, 'w') as outf:
            for f in infiles:
                track = P.snip(os.path.basename(f), ".effects.gz")
                statfile = f + "." + suffix + ".gz"
                print(statfile)
                if not os.path.exists(statfile):
                    E.warn("File %s missing" % statfile)
                    continue
                with gzip.open(statfile, "rt") as inf:
                    lines = inf.readlines()
                if first:
                    outf.write("%s\t%s" % ("track", lines[0]))
                    first = False
                for line in lines[1:]:
                    outf.write("%s\t%s" % (track, line))

        P.load(merged_name,
               outfile,
               tablename=tablename + "_" + suffix,
               options="--add-index=transcript_id "
               "--allow-empty-file "
               "--ignore-column=seq_na "
               "--ignore-column=seq_aa")

Esempio n. 19

0

Mostra file

File: pipeline_motifs.py Progetto: logust79/cgat-flow

def loadMemeSummary(infiles, outfile):
    '''Load the list of tracks that have MEME output into the database.

    A one-column temporary table (header ``track``) is written with one
    row per non-empty input file; the value is the filename without its
    ``.meme`` suffix.  The table is loaded and the temporary file is
    removed.
    '''
    outf = P.get_temp_file(".")
    outf.write("track\n")

    # lazily evaluated, so files are checked and written one at a time
    tracks = (P.snip(path, ".meme")
              for path in infiles
              if not IOTools.is_empty(path))
    for track in tracks:
        outf.write("%s\n" % track)

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)

Esempio n. 20

0

Mostra file

def loadExpression(infile, outfile):
    '''Load the expression table and export it as a flat text file.

    The load step is skipped when `outfile` already exists.  Likewise,
    the ``utrons_expression`` table is only dumped (through the
    ``sqlite3`` command line client, tab-separated with headers) when
    ``expression.dir/utrons_expression.txt`` does not exist yet.
    '''
    already_loaded = os.path.isfile(outfile)
    if not already_loaded:
        P.load(infile,
               outfile,
               options="-i Sample -i gene_id -i transcript_id",
               job_memory="16G")

    already_exported = os.path.isfile("expression.dir/utrons_expression.txt")
    if not already_exported:
        export_command = [
            "sqlite3",
            PARAMS["database_name"],
            ".headers on",
            ".mode tab",
            ".output expression.dir/utrons_expression.txt",
            "select * from utrons_expression",
        ]
        subprocess.call(export_command)

Esempio n. 21

0

Mostra file

File: pipeline_motifs.py Progetto: logust79/cgat-flow

def loadMotifInformation(infiles, outfile):
    '''Load the names of the available motifs into the database.

    Writes a one-column temporary table (header ``motif``) holding, for
    every non-empty input file, the filename without its ``.motif``
    suffix.  The table is loaded with ``--allow-empty-file`` as third
    positional argument of ``P.load`` and the temporary file is removed.
    '''
    outf = P.get_temp_file(".")
    outf.write("motif\n")

    for motif_file in infiles:
        if not IOTools.is_empty(motif_file):
            outf.write("%s\n" % P.snip(motif_file, ".motif"))

    outf.close()

    P.load(outf.name, outfile, "--allow-empty-file")

    os.unlink(outf.name)

Esempio n. 22

0

Mostra file

File: pipeline_transfacmatch.py Progetto: logust79/cgat-flow

def loadMatchResults(infile, outfile):
    '''Load the results of the match analysis into the sqlite database.

    Every record yielded by ``PipelineTFM.match_iterator`` becomes one
    tab-separated row (seq_id, matrix_id, position, strand, core_score,
    matrix_score, sequence) in a temporary file, which is loaded with an
    index on ``seq_id`` and then removed.
    '''
    temp = P.getTempFile("./match.dir")

    header = ("seq_id\tmatrix_id\tposition\tstrand\t"
              "core_score\tmatrix_score\tsequence\n")
    temp.write(header)

    for hit in PipelineTFM.match_iterator(infile):
        fields = (hit.seq_id, hit.matrix_id, hit.position,
                  hit.strand, hit.core_score, hit.matrix_score,
                  hit.sequence)
        temp.write("\t".join(str(value) for value in fields) + "\n")

    temp.close()

    P.load(temp.name, outfile, options="--add-index=seq_id")
    os.unlink(temp.name)

Esempio n. 23

0

Mostra file

File: PipelineDeNovoMotifs_python3.py Progetto: sudlab/pipeline_denovo_motifs

def loadTomTom(infile, outfile):
    '''Load tomtom results.

    The motif ``id`` -> ``alt`` mapping is read from the ``targets``
    section of ``<exportdir>/tomtom/<infile>/tomtom.xml``.  The text
    results in `infile` are then copied to a temporary file with the
    target's ``alt`` name prepended to every row (the ``#Query`` header
    line is replaced by a fixed header) and loaded.

    If the xml file does not exist, a warning is issued, `outfile` is
    touched and nothing is loaded.

    Raises
    ------
    KeyError
        If a row references a target id that is not in the xml file.
    '''

    resultsdir = os.path.join(os.path.abspath(PARAMS["exportdir"]), "tomtom",
                              infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        P.touch(outfile)
        return

    # get the motif name from the xml file

    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    name2alt = {}
    # Element.getiterator() was removed in Python 3.9; iter() is the
    # supported equivalent
    for motif in motifs.iter("motif"):
        name2alt[motif.get("id")] = motif.get("alt")

    tmpfile = P.get_temp_file(".")

    # parse the text file
    with IOTools.open_file(infile) as inf:
        for line in inf:
            if line.startswith("#Query"):
                tmpfile.write('\t'.join(("target_name", "query_id",
                                         "target_id", "optimal_offset",
                                         "pvalue", "evalue", "qvalue",
                                         "Overlap", "query_consensus",
                                         "target_consensus",
                                         "orientation")) +
                              "\n")
                continue
            data = line[:-1].split("\t")
            target_name = name2alt[data[1]]
            tmpfile.write("%s\t%s" % (target_name, line))
    tmpfile.close()

    try:
        P.load(tmpfile.name, outfile)
    finally:
        # remove the temporary file even when loading fails
        os.unlink(tmpfile.name)

Esempio n. 24

0

Mostra file

File: pipeline_utrons_filtered_geneset.py Progetto: sudlab/pipeline_utrons

def identify_splice_sites(infiles, outfiles):
    '''Run the splice-site script on the input and load its output.

    `outfiles` is unpacked into two names: the table written by the
    shell statement and the log file handed to ``P.load``.

    The shell statement runs ``splicesites_start_end_sizes.py`` (looked
    up in a ``pipeline_utrons`` directory next to this file),
    de-duplicates the output with ``sort -u`` and inserts a tab-separated
    header line at the top with ``sed``.
    '''
    infile = infiles
    outfile, outfile_load = outfiles
    # locate the helper script relative to this pipeline file
    current_file = __file__
    pipeline_path = os.path.abspath(current_file)
    pipeline_directory = os.path.dirname(pipeline_path)
    script_path = "pipeline_utrons/splicesites_start_end_sizes.py"
    ss_path = os.path.join(pipeline_directory, script_path)

    # NOTE(review): the %(name)s placeholders are not filled in here;
    # presumably P.run interpolates them from this function's local
    # variables (ss_path, infile, outfile) - confirm before renaming any
    # of these locals.
    statement = ''' python %(ss_path)s %(infile)s %(outfile)s;
                    sort -u %(outfile)s > %(outfile)s_2.txt; rm %(outfile)s; mv %(outfile)s_2.txt %(outfile)s;
                    sed -i $'1i transcript_id\\tstrand\\tss5\\tss3\\tcontig\\tsplice_site_start\\tsplice_site_end\\tutron_size' %(outfile)s '''
    P.run(statement)
    # load the finished table; "-i" options name the columns listed in
    # the header above
    P.load(
        outfile,
        outfile_load,
        options=
        "-i transcript_id -i ss5 -i ss3 -i splice_site_start -i splice_site_end -i utron_size",
        job_memory="16G")

Esempio n. 25

0

Mostra file

def loadEffects(infile, outfile):
    '''Load transcript effects into tables.

    `infile` (``<root>.effects.gz``) is loaded into the table
    ``<root>_effects``.  The companion files ``<infile>.<suffix>.gz``
    for the suffixes ``cds``, ``intron``, ``splicing`` and
    ``translation`` are loaded into ``<root>_effects_<suffix>``, ignoring
    the ``seq_na`` and ``seq_aa`` columns.  All tables are indexed on
    ``transcript_id``.
    '''

    root = infile[:-len(".effects.gz")]

    P.load(infile,
           outfile,
           tablename=root + "_effects",
           options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation"):

        # Load the per-suffix companion file.  Previously the main infile
        # was loaded again into every per-suffix table.
        # NOTE(review): the "<infile>.<suffix>.gz" naming follows the
        # convention used when merging effects files - confirm.
        P.load(infile + "." + suffix + ".gz",
               outfile,
               tablename=root + "_effects_" + suffix,
               options="--add-index=transcript_id "
               "--allow-empty-file "
               "--ignore-column=seq_na "
               "--ignore-column=seq_aa")

Esempio n. 26

0

Mostra file

def loadMemeSummary(infiles, outfile):
    '''Load the list of (method, track) pairs with motif output.

    For each non-empty input file the method is the part of the path
    captured in front of ``.dir/`` and the track is the basename of the
    path with its last dot-separated component removed.  The pairs are
    written to a temporary two-column table (``method``, ``track``),
    which is loaded and then removed.
    '''
    outf = P.get_temp_file(".")
    outf.write("method\ttrack\n")

    for path in infiles:
        if IOTools.is_empty(path):
            continue
        method = re.match("(.+).dir/", path).group(1)
        without_extension = ".".join(path.split(".")[:-1])
        track = os.path.basename(without_extension)
        outf.write("%s\t%s\n" % (method, track))

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)

Esempio n. 27

0

Mostra file

def mergeAnnotations(infiles, outfile):
    '''Load variant annotations into a single database table.

    The gzipped ``*.annotations.gz`` files are concatenated into
    ``anno.txt``: the header of the first existing file is kept and every
    data line is prefixed with its track (the file's basename without
    ``.annotations.gz``).  Missing files are skipped with a warning.  The
    merged file is then loaded.
    '''

    merged_name = 'anno.txt'
    first = True
    with open(merged_name, 'w') as outf:
        for f in infiles:
            track = P.snip(os.path.basename(f), ".annotations.gz")
            if not os.path.exists(f):
                E.warn("File %s missing" % f)
                continue
            # text mode: binary mode yields bytes, which would be written
            # as their repr ("b'...'") by the %s formatting below
            with gzip.open(f, "rt") as inf:
                lines = inf.readlines()
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False
            for line in lines[1:]:
                outf.write("%s\t%s" % (track, line))

    # load the file that was just written (the name used to be misspelt
    # as 'anno.text')
    P.load(merged_name, outfile)

Esempio n. 28

0

Mostra file

File: pipeline_promotors.py Progetto: logust79/cgat-flow

def loadTranscriptSummary(infile, outfile):
    '''Summarize binding information per transcript.

    Drops the target table (named after `outfile`), then builds a 0/1
    matrix with one row per transcript in
    ``annotations.transcript_info`` and one column per source table
    (``tata``, ``cpg``), flagging whether the transcript occurs in that
    table.  The matrix is written to a temporary file, loaded and the
    temporary file is removed.
    '''
    dbh = connect()
    table = P.toTable(outfile)
    cc = dbh.cursor()

    # sqlite can not do full outer join
    cc.execute("""DROP TABLE IF EXISTS %(table)s""" % {"table": table})

    transcript_rows = cc.execute(
        "SELECT DISTINCT(transcript_id) FROM annotations.transcript_info"
    ).fetchall()
    transcripts = [row[0] for row in transcript_rows]

    tmpf = P.getTempFile()

    tables = ("tata", "cpg")

    # one set of transcript ids per source table
    vals = []
    for source in tables:
        hits = cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %(table)s" %
            {"table": source}).fetchall()
        vals.append({row[0] for row in hits})

    tmpf.write("transcript_id\t%s\n" % "\t".join(tables))

    for transcript_id in transcripts:
        flags = [str(int(transcript_id in members)) for members in vals]
        tmpf.write("%s\t%s\n" % (transcript_id, "\t".join(flags)))

    tmpf.close()

    P.load(tmpf.name, outfile)
    os.unlink(tmpf.name)

Esempio n. 29

0

Mostra file

File: pipeline_motifs.py Progetto: logust79/cgat-flow

def loadMemeChipSummary(infiles, outfile):
    '''Load information about the MEME-ChIP runs into the database.

    The basename of every non-empty input file, minus ``.memechip``, is
    expected to consist of exactly four dot-separated parts: track,
    npeaks, width and masking.  These, plus the snipped name itself as
    ``path``, form one row of a temporary table that is loaded and then
    removed.

    Raises
    ------
    ValueError
        If a snipped filename does not split into exactly four parts.
    '''
    outf = P.get_temp_file(".")
    outf.write("track\tnpeaks\twidth\tmasking\tpath\n")

    for memechip_file in infiles:
        if IOTools.is_empty(memechip_file):
            continue

        fn = P.snip(os.path.basename(memechip_file), ".memechip")
        track, npeaks, width, masking = fn.split(".")

        row = (track, npeaks, width, masking, fn)
        outf.write("%s\t%s\t%s\t%s\t%s\n" % row)

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)

Esempio n. 30

0

Mostra file

def loadLncRNAClass(infile, outfile):
    '''Load the lncRNA classifications.

    For every transcript in the GTF file `infile` one row is written to
    a temporary file, holding the ``transcript_id``, ``gene_id`` and
    ``source`` of the transcript's first entry.  The file is loaded with
    the column names ``transcript_id``, ``gene_id`` and ``class`` and
    indices on both id columns, then removed.
    '''

    # just load each transcript with its classification
    temp = P.getTempFile(".")
    inf = IOTools.openFile(infile)
    try:
        for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
            first_entry = transcript[0]
            temp.write("%s\t%s\t%s\n" %
                       (first_entry.transcript_id, first_entry.gene_id,
                        first_entry.source))
    finally:
        # the input handle used to be left open
        inf.close()
    temp.close()

    try:
        P.load(temp.name,
               outfile,
               options="--header-names=transcript_id,gene_id,class "
               "--add-index=transcript_id "
               "--add-index=gene_id")
    finally:
        # remove the temporary file even when loading fails
        os.unlink(temp.name)