Code example #1
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()

    os.unlink(tmpfilename)
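
Most examples on this page share one scaffold: write a tab-separated temporary file, load it into the SQLite database (via csv2db or P.load), then unlink the temporary file. Below is a minimal sketch of that scaffold, not taken from any one pipeline; it assumes the CGATPipelines `P` module (imported here under an assumed path) and that P.load behaves as in the examples that follow.

import os

import CGATPipelines.Pipeline as P  # assumed import path for the P module


def load_rows(rows, outfile):
    '''Sketch: write (track, value) pairs to a temp file and load them.

    The column names are illustrative only.
    '''
    outf = P.getTempFile(".")      # named temp file in the working directory
    outf.write("track\tvalue\n")   # header row, as csv2db/P.load expect
    for track, value in rows:
        outf.write("%s\t%s\n" % (track, value))
    outf.close()

    # P.load derives the table name from `outfile` unless told otherwise.
    P.load(outf.name, outfile, options="--add-index=track")
    os.unlink(outf.name)           # getTempFile files are not deleted automatically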
Code example #2
def exportMotifLocations(infiles, outfile):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''

    dbh = connect()
    cc = dbh.cursor()

    motifs = [
        x[0] for x in cc.execute("SELECT motif FROM motif_info").fetchall()
    ]

    for motif in motifs:

        tmpf = P.getTempFile(".")

        for infile in infiles:
            table = P.toTable(infile)
            track = P.snip(table, "_mast")
            for x in cc.execute(
                    """SELECT contig, start, end, '%(track)s', evalue
                    FROM %(table)s WHERE motif = '%(motif)s' AND
                    start IS NOT NULL""" % locals()):
                tmpf.write("\t".join(map(str, x)) + "\n")
        tmpf.close()

        outfile = os.path.join(PARAMS["exportdir"], "motifs",
                               "%s.bed.gz" % motif)
        tmpfname = tmpf.name

        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run()

        os.unlink(tmpf.name)
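
A note on the statement above: newer bedtools releases (2.18 onwards, if memory serves) dropped mergeBed's -nms flag. The rough modern equivalent collapses the name column explicitly, and current bedtools also insists on position-sorted input, so the statement would become something like:

statement = '''sort -k1,1 -k2,2n %(tmpfname)s
            | mergeBed -i stdin -c 4 -o collapse
            | gzip > %(outfile)s'''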
Code example #3
def loadTranscriptSummary(infile, outfile):
    '''summarize binding information per transcript.'''

    dbh = connect()

    table = P.toTable(outfile)

    cc = dbh.cursor()
    # sqlite can not do full outer join
    cc.execute( """DROP TABLE IF EXISTS %(table)s""" % locals() )

    transcripts = [x[0] for x in cc.execute(
        "SELECT DISTINCT(transcript_id) FROM annotations.transcript_info").fetchall()]

    tmpf = P.getTempFile()

    tables = ("tata", "cpg")
    titles = tables

    vals = []
    for table in tables:
        t = set([x[0] for x in cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %(table)s" % locals()).fetchall()])
        vals.append(t)

    tmpf.write("transcript_id\t%s\n" % "\t".join(titles))

    for transcript_id in transcripts:
        tmpf.write("%s\t%s\n" % (transcript_id,
                                 "\t".join([str(int(transcript_id in v)) for v in vals])))

    tmpf.close()

    P.load(tmpf.name, outfile)
    os.unlink(tmpf.name)
Code example #4
def loadPicardGCStats(infiles, outfile):
    '''Merge Picard insert size stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)
    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".gcstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines()
            if not x.startswith("#") and x.strip()
        ]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | cgat csv2db
                      %(csv2db_options)s
                      --add-index=track
                      --table=%(tablename)s 
                   > %(outfile)s '''
    P.run()

    os.unlink(tmpfilename)
Code example #5
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names that are present in a
    fasta file, return CpG content file
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''
    cat %(inf)s | cgat fasta2table
    -s na -s cpg -s length
    --log=%(outfile)s.log > %(outfile)s'''

    P.run()
Code example #6
def loadPicardGCStats(infiles, outfile):
    '''Merge Picard insert size stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)
    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".gcstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                      %(csv2db_options)s
                      --add-index=track
                      --table=%(tablename)s 
                   > %(outfile)s '''
    P.run()

    os.unlink(tmpfilename)
Code example #7
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()

    os.unlink(tmpfilename)
Code example #8
def loadPicardCoverageStats(infiles, outfile):
    '''import coverage statistics into database.
    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    outf = P.getTempFile(".")
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".cov")
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    P.load(outf.name,
           outfile,
           options="--ignore-empty --add-index=track")
    os.unlink(outf.name)
Code example #9
def buildBenchmarkInput(infile, outfile):

    tmpfile = P.getTempFile()

    dbhandle = sqlite3.connect(PARAMS["database_name"])
    cc = dbhandle.cursor()
    statement = '''
    SELECT DISTINCT transcript_id, protein_id FROM peptide_info
    '''
    cc.execute(statement)
    tmpfile.write("transcript_id\tprotein_id\n")
    tmpfile.write("\n".join(["\t".join(x) for x in cc]))
    tmpfile.write("\n")
    tmpfilename = tmpfile.name

    statement = '''
    perl %(scriptsdir)s/extract_fasta.pl %(infile)s
    < cds.fasta
    | python %(scriptsdir)s/fasta2variants.py --is-cds
    | python %(scriptsdir)s/substitute_tokens.py 
             --map-tsv-file=%(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
Code example #10
def importRepeatsFromUCSC(infile, outfile, ucsc_database, repeattypes, genome):
    '''import repeats from a UCSC formatted file.

    The repeats are stored as a :term:`gff` formatted file.
    '''

    repclasses = "','".join(repeattypes.split(","))

    # Repeats are either stored in a single ``rmsk`` table (hg19) or in
    # individual ``rmsk`` tables (mm9) like chr1_rmsk, chr2_rmsk, ....
    # In order to do a single statement, the ucsc mysql database is
    # queried for tables that end in rmsk.
    dbhandle = PipelineUCSC.connectToUCSC(
        host=PARAMS["ucsc_host"],
        user=PARAMS["ucsc_user"],
        database=ucsc_database)

    cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if len(tables) == 0:
        raise ValueError("could not find any `rmsk` tables")

    tmpfile = P.getTempFile(shared=True)

    total_repeats = 0
    for table in tables:
        E.info("%s: loading repeats from %s" % (ucsc_database, table))
        cc = dbhandle.execute(
            """SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd, '.',
            strand, '.',
            CONCAT('class \\"', repClass, '\\"; family \\"', repFamily, '\\";')
            FROM %(table)s
            WHERE repClass in ('%(repclasses)s') """ % locals())
        n = 0
        for data in cc.fetchall():
            n += 1
            tmpfile.write("\t".join(map(str, data)) + "\n")
        E.info("%s: %s=%i repeats downloaded" % (ucsc_database, table, n))
        total_repeats += n

    if total_repeats == 0:
        raise ValueError("did not find any repeats for %s" % ucsc_database)

    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''cat %(tmpfilename)s
    | %(pipeline_scriptsdir)s/gff_sort pos
    | cgat gff2gff
    --method=sanitize
    --sanitize-method=genome
    --skip-missing
    --genome-file=%(genome)s
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
Code example #11
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [
            x for x in open(fn, "r").readlines()
            if not x.startswith("#") and x.strip()
        ]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):

        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
                      --missing-value=0
                   %(filenames)s
                | python %(scriptsdir)s/csv2db.py
                      --header-names=%(column)s,%(header)s
                      --replace-header
                      --add-index=track
                      --table=%(tname)s 
                >> %(outfile)s
                """

        P.run()

    os.unlink(tmpfilename)
Code example #12
def buildProbe2GeneMap(infile,
                       outfile,
                       PARAMS,
                       platform="affy"):
    '''
    build file mapping probe id to gene id
    '''
    if platform == "affy":
        array = PARAMS.get("affy_array")
        dataset = PARAMS.get("affy_dataset")
        
        R('''library("biomaRt")''')
        R('''library("affy")''')
        R('''dat <- ReadAffy()''')

        E.info("getting probes")
        R('''probes <- featureNames(dat)''')

        E.info("getting mart")
        R('''mart <- useMart("ensembl", dataset = "%s")''' % dataset)

        # matches to hgnc symbol - this might not be appropriate for mouse data...
        E.info("mapping probes to gene")
        R('''probe2gene <- getBM(attributes = c("%s", "external_gene_name"), filters = "%s", values = probes, mart = mart)''' % (array, array))
        R('''colnames(probe2gene) <- c("probe", "gene")''')
        R('''probe2gene$gene <- toupper(probe2gene$gene)''')

        # remove probes that have no gene assignment (i.e. returned "" from
        # biomaRt) and those with multiple gene assignments (cross-hybridisation)
        temp = P.getTempFile(".")
        E.info("writing temp file")
        R('''write.table(probe2gene, file = "%s", sep = "\t", row.names = F)''' % temp.name)
        temp.close()
        E.info("filtering probes")
        inf = open(temp.name)
        header = inf.readline()
        outf = open(outfile, "w")
        outf.write(header)
        counts = collections.defaultdict(int)
        probe2gene = {}
        for line in inf.readlines():
            data = line[:-1].split("\t")
            probe, gene = data[0], data[1]
            if gene.strip('"') == '': continue
            probe2gene[probe] = gene
            counts[probe] += 1
        for probe, count in counts.iteritems():
            if count == 1:
                outf.write("%s\t%s\n" % (probe, probe2gene[probe]))
        outf.close()
        os.unlink(temp.name)
    else:
        R('''
          library(limma)
          # read in data - maintain detection p-values for bg correction
          dat <- read.ilmn(files = "%s", other.columns = "Detection")
          probe2gene <- data.frame("probe" = rownames(dat), "gene" = dat$genes$TargetID)
          write.table(probe2gene, file = "%s", row.names = F, sep = "\t")
          ''' % (infile, outfile))
Code example #13
def loadIdxstats(infiles, outfile):
    '''take list of file paths to samtools idxstats output files
    and merge to create single dataframe containing mapped reads per
    contig for each track. This dataframe is then loaded into
    database.

    Loads tables into the database
        * idxstats_reads_per_chromosome

    Arguments
    ---------
    infiles : list
        list where each element is a string of the filename containing samtools
        idxstats output. Filename format is expected to be 'sample.idxstats'
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    outf = P.getTempFile(".")
    dfs = []
    for f in infiles:
        track = P.snip(f, ".idxstats").split('/')[-1]

        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue

        # reformat idx stats
        df = pandas.read_csv(f, sep='\t', header=None)
        df.columns = ['region', 'length', 'mapped', 'unmapped']

        # calc total reads mapped & unmappedpep
        total_reads = df.unmapped.sum() + df.mapped.sum()
        total_mapped_reads = df.mapped.sum()

        reformatted_df = pandas.DataFrame([['total_mapped_reads', total_mapped_reads],
                                           ['total_reads', total_reads],
                                           ['track', track]], columns=(['region', 'mapped']))

        # reformat the df
        df = df.append(reformatted_df, ignore_index=True)
        df.set_index('region', inplace=True)
        df1 = df[['mapped']].T
        # set track as index
        df1.set_index('track', inplace=True)
        dfs.append(df1)

    # merge dataframes into single table
    master_df = pandas.concat(dfs)
    master_df.drop('*', axis=1, inplace=True)
    # transform dataframe to avoid reaching column limit
    master_df = master_df.T
    master_df.to_csv(outf, sep='\t', index=True)
    outf.close()

    P.load(outf.name,
           outfile,
           options="--ignore-empty --add-index=track")
    os.unlink(outf.name)
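
One caveat on the pandas calls above: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so on a current pandas the reformatting step needs pandas.concat. A self-contained sketch of the equivalent step, with made-up sample values:

import pandas

# Stand-in for the per-file idxstats table built in the example above.
df = pandas.DataFrame({'region': ['chr1', 'chr2', '*'],
                       'length': [1000, 800, 0],
                       'mapped': [500, 300, 0],
                       'unmapped': [5, 3, 42]})
track = 'sample1'

extra = pandas.DataFrame(
    [['total_mapped_reads', df.mapped.sum()],
     ['total_reads', df.mapped.sum() + df.unmapped.sum()],
     ['track', track]],
    columns=['region', 'mapped'])

# pandas >= 2.0 replacement for df.append(extra, ignore_index=True)
df = pandas.concat([df, extra], ignore_index=True)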
Code example #14
def loadBioProspector(infile, outfile):
    '''load results from bioprospector.'''

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "bioprospector")

    try:
        os.makedirs(target_path)
    except OSError:
        pass

    track = infile[:-len(".bioprospector")]

    results = Bioprospector.parse(IOTools.openFile(infile, "r"))

    tmpfile = P.getTempFile()
    tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

    for x, motifs in enumerate(results):
        outname = os.path.join(target_path, "%s_%02i.png" % (track, x))
        Bioprospector.build_logo([y.sequence for y in motifs.matches],
                                 outname)

        for match in motifs.matches:

            distance = abs(
                match.start + match.width1 - (match.end - match.width2))

            if match.strand in ("+-", "-+"):
                arrangement = "ER"
            elif match.strand in ("++", "--"):
                arrangement = "DR"
            else:
                arrangement = "SM"
                distance = 0

            arrangement += "%i" % distance
            strand = match.strand[0]

            id = re.sub(".*_", "", match.id)
            tmpfile.write("%s\t%i\t%i\t%i\t%s\t%s\n" %
                          (id,
                           x,
                           match.start,
                           match.end,
                           strand,
                           arrangement))
    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)
Code example #15
def loadTranscriptProfile(infiles,
                          outfile,
                          suffix="transcript_profile",
                          tablename=None):
    '''load transcript profiles into one table.
    Arguments
    ---------
    infiles : string
        Filenames of files with matrix from bam2geneprofile. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    tablename : string
        Tablename to use. If unset, `suffix` is used as the table
        name.
    '''

    if not tablename:
        tablename = "%s" % (suffix)

    outf = P.getTempFile(".")

    table_count = 0
    table_join = None

    for infile in infiles:

        matrix_file = str(
            infile
        ) + ".geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz"
        name = P.snip(os.path.basename(infile), ".transcriptprofile.gz")

        table = pd.read_csv(matrix_file, sep="\t")
        table.rename(columns={'none': name}, inplace=True)
        table.drop(["area", "counts", "background"], axis=1, inplace=True)

        if table_count == 0:
            table_join = table
            table_count += 1
        else:
            table_join = table.merge(table_join,
                                     on=["bin", "region", "region_bin"],
                                     how="left")
    table_join.to_csv(outf, sep="\t", index=False)

    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=bin")

    os.unlink(outf.name)
Code example #16
def buildGenomicFunctionalAnnotation(gtffile, dbh, outfiles):
    '''output a bed file with genomic regions with functional annotations.

    The regions for each gene are given in the gtf file.

    Each bed entry is a gene territory. Bed entries are labeled
    by functional annotations associated with a gene.

    Ambiguities in territories are resolved by outputting
    annotations for all genes within a territory.

    The output file contains annotations for both GO and GOSlim. These
    are prefixed by ``go:`` and ``goslim:``.
    '''
    territories_file = gtffile

    outfile_bed, outfile_tsv = outfiles

    gene2region = {}
    for gtf in GTF.iterator(IOTools.openFile(gtffile, "r")):
        gid = gtf.gene_id.split(":")
        for g in gid:
            gene2region[g] = (gtf.contig, gtf.start, gtf.end, gtf.strand)

    cc = dbh.cursor()

    outf = P.getTempFile(".")
    c = E.Counter()
    term2description = {}
    for db in ('go', 'goslim'):
        for gene_id, go_id, description in cc.execute(
                "SELECT gene_id, go_id, description FROM %s_assignments" % db):
            try:
                contig, start, end, strand = gene2region[gene_id]
            except KeyError:
                c.notfound += 1
                continue
            outf.write(
                "\t".join(map(str, (
                    contig, start, end,
                    "%s:%s" % (db, go_id), 1, strand))) + "\n")
            term2description["%s:%s" % (db, go_id)] = description
    outf.close()
    tmpfname = outf.name
    statement = '''sort -k1,1 -k2,2n  < %(tmpfname)s | uniq
    | gzip > %(outfile_bed)s'''

    P.run()

    outf = IOTools.openFile(outfile_tsv, "w")
    outf.write("term\tdescription\n")
    for term, description in term2description.iteritems():
        outf.write("%s\t%s\n" % (term, description))
    outf.close()

    os.unlink(tmpfname)
Code example #17
def buildPicardDuplicationStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard, the marked records
    are discarded.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''

    job_memory = PICARD_MEMORY
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # currently, MarkDuplicates cannot handle split alignments from gsnap
    # these can be identified by the custom XT tag.
    if ".gsnap.bam" in infile:
        tmpf = P.getTempFile(".")
        tmpfile_name = tmpf.name
        statement = '''samtools view -h %(infile)s
        | awk "!/\\tXT:/"
        | samtools view /dev/stdin -S -b > %(tmpfile_name)s;
        ''' % locals()
        data_source = tmpfile_name
    else:
        statement = ""
        data_source = infile

    os.environ["CGAT_JAVA_OPTS"] = "-Xmx%s -XX:+UseParNewGC\
                                    -XX:+UseConcMarkSweepGC" % (PICARD_MEMORY)

    statement += '''MarkDuplicates
    INPUT=%(data_source)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s
    OUTPUT=/dev/null
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run()

    os.unsetenv("CGAT_JAVA_OPTS")

    if ".gsnap.bam" in infile:
        os.unlink(tmpfile_name)
Code example #18
def loadCountReads(infiles,
                   outfile,
                   suffix="nreads",
                   pipeline_suffix=".nreads",
                   tablename=None):
    '''load read counts.
    Arguments
    ---------
    infiles : string
        Filenames of files with number of reads per sample. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)

    outf = P.getTempFile(".")

    outf.write("%s\t%s\n" % ("track", "nreads"))

    for filename in infiles:
        track = P.snip(os.path.basename(filename), pipeline_suffix)

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        lines = IOTools.openFile(filename, "r").readlines()

        for line in lines:
            count = line.split("\t")[1]
            outf.write("%s\t%s\n" % (track, count))

    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=track")

    os.unlink(outf.name)
Code example #19
def load_last_exon_chunks(infile, outfile):
    '''Load gene and exon_ids for last exons into database'''

    from CGAT import GTF

    with P.getTempFile(shared=True) as tmpfile:
        tmpfile.write("gene_id\tchunk_id\n")
        for exon in GTF.iterator(IOTools.openFile(infile)):
            tmpfile.write("\t".join(
                [exon.gene_id, re.sub(";", "", exon["exon_id"])]) + "\n")
        tmpfn = tmpfile.name

    P.load(tmpfn, outfile, options="-i gene_id -i exon_id")
    os.unlink(tmpfn)
Code example #20
def buildCDSFasta(infile, outfile):
    '''load ENSEMBL cdna FASTA file

    *infile* is an ENSEMBL cdna file.
    '''

    dbname = outfile[:-len(".fasta")]
    # infile_peptides, infile_cdnas = infiles

    statement = '''gunzip < %(infile)s
    | python %(scriptsdir)s/gff2fasta.py
        --is-gtf
        --genome=%(genome_dir)s/%(genome)s
    | python %(scriptsdir)s/index_fasta.py
    %(dbname)s --force-output -
    > %(dbname)s.log
    '''
    P.run()
    return
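
    # Everything below this early return is unreachable; it looks like a
    # disabled legacy code path kept in the original source (note the
    # undefined infile_peptides/infile_cdnas from the commented-out line above).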

    tmpfile = P.getTempFile(".")

    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    tmpfile.write("protein_id\ttranscript_id\n")
    tmpfile.write("\n".join(
        ["%s\t%s" % x for x in
         cc.execute(
             "SELECT DISTINCT protein_id, transcript_id "
             "FROM transcript_info")]))
    tmpfile.write("\n")

    tmpfile.close()

    tmpfilename = tmpfile.name

    statement = '''
    python %(scriptsdir)s/peptides2cds.py
           --peptides-fasta-file=%(infile_peptides)s
           --cdnas=%(infile_cdnas)s
           --map=%(tmpfilename)s
           --output-format=fasta
           --log=%(outfile)s.log
    | python %(scriptsdir)s/index_fasta.py
    %(dbname)s --force-output -
    > %(dbname)s.log
    '''

    P.run()
    os.unlink(tmpfilename)
Code example #21
def loadMotifInformation(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.getTempFile(".")

    outf.write("motif\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".motif")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile, "--allow-empty-file")

    os.unlink(outf.name)
Code example #22
def extractEnsemblLincRNA(infile, outfile):
    tmpf = P.getTempFile("/ifs/scratch")
    for gtf in GTF.iterator(IOTools.openFile(infile)):
        if gtf.source == "lincRNA":
            tmpf.write(str(gtf) + "\n")
        else:
            continue
    tmpf.close()
    tmpf = tmpf.name

    statement = ("cat %(tmpf)s |"
                 " python %(scriptsdir)s/gtf2gtf.py"
                 "  --method=sort --sort-order=gene"
                 "  --log=%(outfile)s.log |"
                 " gzip > %(outfile)s")
    P.run()

    os.unlink(tmpf)
Code example #23
def loadMatchResults(infile, outfile):
    '''
    load the results of the match analysis into sqlite database
    '''
    temp = P.getTempFile("./match.dir")
    temp.write("seq_id\tmatrix_id\tposition\tstrand\t"
               "core_score\tmatrix_score\tsequence\n")
    for details in PipelineTFM.match_iterator(infile):
        temp.write("\t".join(
            map(str, [
                details.seq_id, details.matrix_id, details.position,
                details.strand, details.core_score, details.matrix_score,
                details.sequence
            ])) + "\n")
    temp.close()

    P.load(temp.name, outfile, options="--add-index=seq_id")
    os.unlink(temp.name)
Code example #24
File: pipeline_motifs.py (project: wbyu/CGATPipelines)
def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.getTempFile(".")

    outf.write("track\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".meme")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
Code example #25
def extractEnsemblLincRNA(infile, outfile):
    tmpf = P.getTempFile("/ifs/scratch")
    for gtf in GTF.iterator(IOTools.openFile(infile)):
        if gtf.source == "lincRNA":
            tmpf.write(str(gtf) + "\n")
        else:
            continue
    tmpf.close()
    tmpf = tmpf.name

    statement = ("cat %(tmpf)s |"
                 " cgat gtf2gtf"
                 "  --method=sort --sort-order=gene"
                 "  --log=%(outfile)s.log |"
                 " gzip > %(outfile)s")
    P.run()

    os.unlink(tmpf)
Code example #26
def importFromIterator(
        outfile,
        tablename,
        iterator,
        columns=None,
        indices=None):
    '''import data in *iterator* into *tablename* via temporary file.

    '''

    tmpfile = P.getTempFile(".")

    if columns:
        keys, values = zip(*columns.items())
        tmpfile.write("\t".join(values) + "\n")

    for row in iterator:
        if not columns:
            keys = list(row.keys())
            values = keys
            columns = keys
            tmpfile.write("\t".join(values) + "\n")

        tmpfile.write("\t".join(str(row[x]) for x in keys) + "\n")

    tmpfile.close()

    if indices:
        indices = " ".join("--add-index=%s" % x for x in indices)
    else:
        indices = ""

    tmpfilename = tmpfile.name

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s
    --table=%(tablename)s
    %(indices)s
    < %(tmpfilename)s > %(outfile)s
    '''

    P.run()

    os.unlink(tmpfilename)
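
A hypothetical call showing how the arguments fit together; the rows, the column mapping and the table name are invented for illustration:

# Rows are dicts keyed by the keys of the `columns` mapping; the mapping's
# values become the column headers written to the temporary file.
rows = [{"gene_id": "ENSG0001", "length": 1200},
        {"gene_id": "ENSG0002", "length": 800}]

importFromIterator(
    outfile="gene_lengths.load",
    tablename="gene_lengths",
    iterator=iter(rows),
    columns={"gene_id": "gene_id", "length": "length"},
    indices=("gene_id",))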
Code example #27
def loadTomTom(infile, outfile):
    '''load tomtom results'''

    tablename = P.toTable(outfile)

    resultsdir = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        P.touch(outfile)
        return

    # get the motif name from the xml file

    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    name2alt = {}
    for motif in motifs.getiterator("motif"):
        name = motif.get("name")
        alt = motif.get("alt")
        name2alt[name] = alt

    tmpfile = P.getTempFile(".")

    # parse the text file
    for line in IOTools.openFile(infile):
        if line.startswith("#Query"):
            tmpfile.write('\t'.join(
                ("target_name", "query_id", "target_id",
                 "optimal_offset", "pvalue", "evalue",
                 "qvalue", "Overlap", "query_consensus",
                 "target_consensus", "orientation")) + "\n")
            continue
        data = line[:-1].split("\t")
        target_name = name2alt[data[1]]
        tmpfile.write("%s\t%s" % (target_name, line))
    tmpfile.close()

    P.load(tmpfile.name, outfile)

    os.unlink(tmpfile.name)
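
Note that Element.getiterator, used above to walk the motifs, was removed in Python 3.9; on a modern Python the same loop reads:

for motif in motifs.iter("motif"):
    name2alt[motif.get("name")] = motif.get("alt")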
Code example #28
def vennOverlap(dbh, outfiles):
    '''
    use limma to draw a Venn diagram of the overlap between
    two conditions
    '''
    cc = dbh.cursor()
    contrasts = set()
    for data in cc.execute("""SELECT contrast FROM differentially_expressed""").fetchall():
        contrasts.add(data[0])

    for cont1, cont2 in itertools.combinations(contrasts, 2):
        result = collections.defaultdict(list)
        temp = P.getTempFile(".")

        for data in cc.execute("""SELECT contrast, probe_id, status FROM differentially_expressed""").fetchall():
            contrast, probe, status = data[0], data[1], data[2]
            if probe in result: continue
            result["probe"].append(probe)
            if status == 2:
                status = 1
            if contrast == cont1:
                result[cont1].append(status)
            elif contrast == cont2:
                result[cont2].append(status)
        temp.write("\t".join(["probe", P.snip(cont1, "_result"), P.snip(cont2, "_result")]) + "\n")

        for data in zip(*[result["probe"], result[cont1], result[cont2]]):
            temp.write("\t".join(map(str, list(data))) + "\n")
        temp.close()
        inf = temp.name
        outf = os.path.join("differential_expression.dir", "%s_vs_%s.venn.pdf" % (cont1, cont2))
        
        R('''
        library("limma")
        dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t", row.names = 1)
        venn.data <- vennCounts(dat)

        pdf("%s")
        vennDiagram(venn.data, circle.col = c(rgb(1,0,0,0.5), rgb(0,1,0,0.5)), lwd = 2)
        dev.off()
        ''' % (inf, outf))

        os.unlink(inf)
Code example #29
def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.getTempFile(".")

    outf.write("method\ttrack\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        method = re.match("(.+).dir/", infile).groups()[0]
        track = os.path.basename(".".join(infile.split(".")[:-1]))
        outf.write("%s\t%s\n" % (method, track))

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
Code example #30
def loadMissedReadCounts(infiles, outfile):
    """load summary table of numbers of missed reads."""

    def _getlines(inf):
        return len(IOTools.openFile(inf).readlines()) - 1

    tmpfile = P.getTempFile()

    infiles = sorted(infiles)

    tmpfile.write("track\tmapped_genome\tmissed_junctions\tmissed_transcriptome\n")

    for x in range(0, len(infiles), 2):
        junctions, transcriptome = infiles[x], infiles[x + 1]
        track = P.snip(junctions, ".missed_junctions.gz")
        mapped_genome = _getlines(track + ".mapped_reads.gz")
        tmpfile.write("%s\t%i\t%i\t%i\n" % (track, mapped_genome, _getlines(junctions), _getlines(transcriptome)))
    tmpfile.close()
    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
Code example #31
def genericImportAnnotator(infiles, outfile, table, workspace, slice, subset,
                           fdr_method):
    '''generic import of annotator results.

    Assumes that the suffix of all infiles is the same.
    '''

    infile = " ".join(infiles)
    x, suffix = os.path.splitext(infiles[0])

    tmpfilename = P.getTempFilename()

    statement = '''
    cgat annotator2tsv \
    --method=fdr-table \
    --fdr-method=%(fdr_method)s \
    --log=%(outfile)s.log \
    --regex-identifier="(.*)%(suffix)s" \
    %(infile)s > %(tmpfilename)s
    '''
    P.run()

    tmpfile = P.getTempFile()

    for line in open(tmpfilename, "r"):
        if line.startswith("id"):
            line = "subset\tworkspace\tslice\t" + re.sub("^id", "track", line)
        else:
            line = "%s\t%s\t%s\t%s" % (subset, workspace, slice, line)
        tmpfile.write(line)
    tmpfile.close()
    tmpfilename2 = tmpfile.name

    statement = '''
   cgat csv2db %(csv2db_options)s \
    --table=%(table)s
    < %(tmpfilename2)s > %(outfile)s'''

    P.run(**dict(list(locals().items()) + list(PARAMS.items())))
    os.unlink(tmpfilename)
    os.unlink(tmpfilename2)
Code example #32
File: pipeline_motifs.py (project: wbyu/CGATPipelines)
def loadMemeChipSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.getTempFile(".")

    outf.write("track\tnpeaks\twidth\tmasking\tpath\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        fn = P.snip(os.path.basename(infile), ".memechip")

        track, npeaks, width, masking = fn.split(".")
        outf.write("\t".join(map(str, (track, npeaks, width, masking, fn))) +
                   "\n")

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
Code example #33
def loadLncRNAClass(infile, outfile):
    '''
    load the lncRNA classifications
    '''

    # just load each transcript with its classification
    temp = P.getTempFile(".")
    inf = IOTools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" %
                   (transcript[0].transcript_id, transcript[0].gene_id,
                    transcript[0].source))
    temp.close()

    P.load(temp.name,
           outfile,
           options="--header-names=transcript_id,gene_id,class "
           "--add-index=transcript_id "
           "--add-index=gene_id")

    os.unlink(temp.name)
Code example #34
    def readChunk(lines, chunk):
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.getTempFile(".")
        try:
            motif, part = re.match(
                ":: motif = (\S+) - (\S+) ::", lines[chunks[chunk]]).groups()
        except AttributeError:
            raise ValueError(
                "parsing error in line '%s'" % lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(IOTools.openFile(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast
Code example #35
def getNumReadsFromBAMFile(infile):
    '''count number of reads in bam file.'''
    # by-passes a problem with pysam, which was reading in stdout as the first
    # elements in list data
    tmpf = P.getTempFile(".")
    tmpfile_name = tmpf.name
    statement = '''samtools idxstats %(infile)s > %(tmpfile_name)s'''

    P.run()

    read_info = IOTools.openFile(tmpfile_name).readlines()
    os.unlink(tmpfile_name)

    try:
        data = sum(map(int, [x.split("\t")[2]
                   for x in read_info if not x.startswith("#")]))
    except IndexError as msg:
        raise IndexError(
            "can't get number of reads from bamfile, msg=%s, data=%s" %
            (msg, read_info))

    return data
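
For comparison, recent pysam versions can produce the same count without shelling out (the comment above refers to an old pysam stdout-capture problem that this function works around). A sketch assuming a pysam where pysam.idxstats returns the full output as one string:

import pysam


def get_num_reads(bamfile):
    '''Sketch: sum the mapped-read column of samtools idxstats via pysam.'''
    return sum(int(line.split("\t")[2])
               for line in pysam.idxstats(bamfile).splitlines()
               if line.strip())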
Code example #36
def genericImportAnnotator(infiles, outfile, table, workspace, slice, subset, fdr_method):
    '''generic import of annotator results.

    Assumes that the suffix of all infiles is the same.
    '''

    infile = " ".join(infiles)
    x, suffix = os.path.splitext(infiles[0])

    tmpfilename = P.getTempFilename()

    statement = '''
    python %(scriptsdir)s/annotator2tsv.py \
    --method=fdr-table \
    --fdr-method=%(fdr_method)s \
    --log=%(outfile)s.log \
    --regex-identifier="(.*)%(suffix)s" \
    %(infile)s > %(tmpfilename)s
    '''
    P.run()

    tmpfile = P.getTempFile()

    for line in open(tmpfilename, "r"):
        if line.startswith("id"):
            line = "subset\tworkspace\tslice\t" + re.sub("^id", "track", line)
        else:
            line = "%s\t%s\t%s\t%s" % (subset, workspace, slice, line)
        tmpfile.write(line)
    tmpfile.close()
    tmpfilename2 = tmpfile.name

    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
    --table=%(table)s
    < %(tmpfilename2)s > %(outfile)s'''

    P.run(**dict(locals().items() + PARAMS.items()))
    os.unlink(tmpfilename)
    os.unlink(tmpfilename2)
Code example #37
def buildPicardDuplicationStats(infile, outfile):
    '''Record duplicate metrics using Picard, the marked records
    are discarded
    '''

    job_options = getPicardOptions()
    job_threads = 3

    if getNumReadsFromBAMFile(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # currently, MarkDuplicates cannot handle split alignments from gsnap
    # these can be identified by the custom XT tag.
    if ".gsnap.bam" in infile:
        tmpf = P.getTempFile(".")
        tmpfile_name = tmpf.name
        statement = '''samtools view -h %(infile)s
        | awk "!/\\tXT:/"
        | samtools view /dev/stdin -S -b > %(tmpfile_name)s;
        ''' % locals()
        data_source = tmpfile_name
    else:
        statement = ""
        data_source = infile

    statement += '''MarkDuplicates
    INPUT=%(data_source)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s
    OUTPUT=/dev/null
    VALIDATION_STRINGENCY=SILENT
    '''

    P.run()

    if ".gsnap.bam" in infile:
        os.unlink(tmpfile_name)
Code example #38
def importFromSeries(infiles, outfile):
    '''import expression levels from a GEO series.'''
    tablename = P.toTable(outfile)

    tmpf = P.getTempFile()

    infile_data, infile_map = infiles

    map_header = IOTools.readMap(open(infile_map, "r"))
    if "ID_REF" not in map_header:
        map_header["ID_REF"] = "probeset"

    inf = gzip.open(infile_data, "r")

    for line in inf:
        if line.startswith("!"):
            continue
        if not line.strip():
            continue
        line = re.sub('"', "", line)
        if line.startswith("ID_REF"):
            line = "\t".join([map_header[x]
                              for x in line[:-1].split("\t")]) + "\n"

        tmpf.write(line)

    tmpf.close()
    tmpname = tmpf.name

    header = map_header["ID_REF"]
    statement = '''
   cgat csv2db %(csv2db_options)s \
              --add-index=%(header)s \
              --table=%(tablename)s \
    < %(tmpname)s > %(outfile)s
    '''

    P.run()
    os.unlink(tmpname)
Code example #39
def loadStrandSpecificity(infiles, outfile, suffix="strand", tablename=None):
    '''load per-track strand specificity summaries into a single table.'''

    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)

    outf = P.getTempFile(".")

    table_count = 0
    table_join = None

    for infile in infiles:
        name = P.snip(os.path.basename(infile), ".strand")

        table = pd.read_csv(infile, sep="\t")
        table["track"] = name

        if table_count == 0:
            table_join = table
            table_count += 1
        else:
            table_join = table.merge(table_join,
                                     on=[
                                         "MSR", "ISR", "OSR", "ISF", "MSF",
                                         "OSF", "SF", "SR", "track"
                                     ],
                                     how="outer")

    table_join.to_csv(outf, sep="\t", index=False)

    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=track")

    os.unlink(outf.name)