Example #1
def loadMAST(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same
    table.

    Add columns for the control data as well.
    '''

    tablename = P.toTable(outfile)

    tmpfile = P.getTempFile(".")

    tmpfile.write(MAST.Match().header +
                  "\tmotif\tcontig"
                  "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end"
                  "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end"
                  "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n")

    lines = IOTools.openFile(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    def readChunk(lines, chunk):
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.getTempFile(".")
        try:
            motif, part = re.match(
                r":: motif = (\S+) - (\S+) ::", lines[chunks[chunk]]).groups()
        except AttributeError:
            raise P.PipelineError(
                "parsing error in line '%s'" % lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(IOTools.openFile(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast

    def splitId(s, mode):
        '''split a match id.

        Background ids have three parts (track _ id _ pos), foreground
        ids have two (track _ id); track itself may contain '_'.
        '''
        d = s.split("_")
        if mode == "bg":
            return "_".join(d[:-2]), d[-2], d[-1]
        elif mode == "fg":
            return "_".join(d[:-1]), d[-1]

    for chunk in range(0, len(chunks) - 1, 2):

        motif_fg, part, mast_fg = readChunk(lines, chunk)
        assert part == "foreground"
        motif_bg, part, mast_bg = readChunk(lines, chunk + 1)
        assert part == "background"
        assert motif_fg == motif_bg

        # index control data
        controls = collections.defaultdict(dict)
        for match in mast_bg.matches:
            track, id, pos = splitId(match.id, "bg")
            controls[id][pos] = (
                match.evalue, match.pvalue, match.nmotifs, match.length, match.start, match.end)

        for match in mast_fg.matches:
            # remove track and pos
            track, match.id = splitId(match.id, "fg")
            # move to genomic coordinates
            contig, start, end = re.match(
                r"(\S+):(\d+)\.\.(\d+)", match.description).groups()
            if match.nmotifs > 0:
                start, end = int(start), int(end)
                match.start += start
                match.end += start
                match.positions = [x + start for x in match.positions]

            id = match.id
            if id not in controls:
                P.warn("no controls for %s - increase MAST evalue" % id)

            if "l" not in controls[id]:
                controls[id]["l"] = (
                    float(PARAMS["mast_evalue"]), 1, 0, 0, 0, 0)
            if "r" not in controls[id]:
                controls[id]["r"] = (
                    float(PARAMS["mast_evalue"]), 1, 0, 0, 0, 0)

            min_evalue = min(controls[id]["l"][0], controls[id]["r"][0])
            min_pvalue = min(controls[id]["l"][1], controls[id]["r"][1])
            max_nmatches = max(controls[id]["l"][2], controls[id]["r"][2])

            tmpfile.write(str(match) + "\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %
                          (motif_fg, contig,
                           "\t".join(map(str, controls[id]["l"])),
                           "\t".join(map(str, controls[id]["r"])),
                           str(min_evalue),
                           str(min_pvalue),
                           str(max_nmatches),
                           ) + "\n")

    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              -b sqlite 
              --index=id 
              --index=motif 
              --index=id,motif 
              --table=%(tablename)s 
              --allow-empty
              --map=base_qualities:text 
    < %(tmpfilename)s > %(outfile)s
    '''

    P.run()
    os.unlink(tmpfile.name)
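A note on the chunking idiom used above: `chunks` holds the indices of all "::" marker lines plus a sentinel at end-of-file, so each section body is the half-open slice between two consecutive markers. A minimal, self-contained sketch of the same technique (the toy input is an assumption about the file layout, not real MAST output):

# toy stand-in for the MAST report parsed by loadMAST above
lines = [
    ":: motif = m1 - foreground ::\n",
    "match A\n",
    "match B\n",
    ":: motif = m1 - background ::\n",
    "match C\n",
]

# indices of all section markers, plus a sentinel at end-of-file
chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
chunks.append(len(lines))

# each section body is the half-open slice between consecutive markers
for i in range(len(chunks) - 1):
    header = lines[chunks[i]].strip()
    body = lines[chunks[i] + 1:chunks[i + 1]]
    print("%s -> %i body lines" % (header, len(body)))

loadMAST then consumes these sections pairwise, a foreground chunk followed by its background chunk.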
Example #2
def loadMAST(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same
    table.

    Add columns for the control data as well.
    '''

    tablename = P.toTable(outfile)

    tmpfile = P.getTempFile(".")

    tmpfile.write(MAST.Match().header +
                  "\tmotif\tcontig"
                  "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end"
                  "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end"
                  "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n")

    lines = IOTools.openFile(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    def readChunk(lines, chunk):
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.getTempFile(".")
        try:
            motif, part = re.match(r":: motif = (\S+) - (\S+) ::",
                                    lines[chunks[chunk]]).groups()
        except AttributeError:
            raise P.PipelineError("parsing error in line '%s'" %
                                  lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(IOTools.openFile(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast

    def splitId(s, mode):
        '''split a match id.

        Background ids have three parts (track _ id _ pos), foreground
        ids have two (track _ id); track itself may contain '_'.
        '''
        d = s.split("_")
        if mode == "bg":
            return "_".join(d[:-2]), d[-2], d[-1]
        elif mode == "fg":
            return "_".join(d[:-1]), d[-1]

    for chunk in range(0, len(chunks) - 1, 2):

        motif_fg, part, mast_fg = readChunk(lines, chunk)
        assert part == "foreground"
        motif_bg, part, mast_bg = readChunk(lines, chunk + 1)
        assert part == "background"
        assert motif_fg == motif_bg

        # index control data
        controls = collections.defaultdict(dict)
        for match in mast_bg.matches:
            track, id, pos = splitId(match.id, "bg")
            controls[id][pos] = (match.evalue, match.pvalue, match.nmotifs,
                                 match.length, match.start, match.end)

        for match in mast_fg.matches:
            # remove track and pos
            track, match.id = splitId(match.id, "fg")
            # move to genomic coordinates
            contig, start, end = re.match(r"(\S+):(\d+)\.\.(\d+)",
                                          match.description).groups()
            if match.nmotifs > 0:
                start, end = int(start), int(end)
                match.start += start
                match.end += start
                match.positions = [x + start for x in match.positions]

            id = match.id
            if id not in controls:
                P.warn("no controls for %s - increase MAST evalue" % id)

            if "l" not in controls[id]:
                controls[id]["l"] = (float(PARAMS["mast_evalue"]), 1, 0, 0, 0,
                                     0)
            if "r" not in controls[id]:
                controls[id]["r"] = (float(PARAMS["mast_evalue"]), 1, 0, 0, 0,
                                     0)

            min_evalue = min(controls[id]["l"][0], controls[id]["r"][0])
            min_pvalue = min(controls[id]["l"][1], controls[id]["r"][1])
            max_nmatches = max(controls[id]["l"][2], controls[id]["r"][2])

            tmpfile.write(str(match) + "\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %
                          (motif_fg, contig,
                           "\t".join(map(str, controls[id]["l"])),
                           "\t".join(map(str, controls[id]["r"])),
                           str(min_evalue),
                           str(min_pvalue),
                           str(max_nmatches),
                           ) + "\n")

    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              -b sqlite 
              --index=id 
              --index=motif 
              --index=id,motif 
              --table=%(tablename)s 
              --allow-empty
              --map=base_qualities:text 
    < %(tmpfilename)s > %(outfile)s
    '''

    P.run()
    os.unlink(tmpfile.name)
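The `d = s.split("_")` fix in both copies of loadMAST deserves a standalone check: splitId previously read the enclosing loop's `match.id` instead of its own argument. The corrected behaviour, on made-up ids (hypothetical, for illustration only):

def splitId(s, mode):
    '''split a match id of the form track_id_pos ("bg")
    or track_id ("fg"); track itself may contain "_".'''
    d = s.split("_")
    if mode == "bg":
        return "_".join(d[:-2]), d[-2], d[-1]
    elif mode == "fg":
        return "_".join(d[:-1]), d[-1]

# hypothetical ids; note the track part keeps its own underscores
assert splitId("run_1_peak7_l", "bg") == ("run_1", "peak7", "l")
assert splitId("run_1_peak7", "fg") == ("run_1", "peak7")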
Example #3
def loadGLAM2SCAN(infile, outfile):
    '''parse glam2scan output and load into database.

    Parse several motif runs and add them to the same
    table.
    '''
    tablename = outfile[:-len(".load")]
    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.write(
        "motif\tid\tnmatches\tscore\tscores\tncontrols\tmax_controls\n")

    lines = IOTools.openFile(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    for chunk in range(len(chunks) - 1):

        # use real file, as parser can not deal with a
        # list of lines

        try:
            motif = re.match(
                r":: motif = (\S+) ::", lines[chunks[chunk]]).groups()[0]
        except AttributeError:
            raise P.PipelineError(
                "parsing error in line '%s'" % lines[chunks[chunk]])

        if chunks[chunk] + 1 == chunks[chunk + 1]:
            P.warn("no results for motif %s - ignored" % motif)
            continue

        tmpfile2 = tempfile.NamedTemporaryFile(delete=False)
        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()
        glam = Glam2Scan.parse(IOTools.openFile(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        # collect control data
        full_matches = collections.defaultdict(list)
        controls = collections.defaultdict(list)
        for match in glam.matches:
            m = match.id.split("_")
            track, id = m[:2]
            if len(m) == 2:
                full_matches[id].append(match)
            else:
                controls[id].append(match.score)

        for id, matches in full_matches.iteritems():

            nmatches = len(matches)
            scores = [x.score for x in matches]
            score = max(scores)
            # move to genomic coordinates
            #contig, start, end = re.match( "(\S+):(\d+)..(\d+)", match.id).groups()
            #start, end = int(start), int(end)
            #match.start += start
            #match.end += start
            contig = ""

            if id not in controls:
                P.warn("no controls for %s - increase evalue?" % id)

            c = controls[id]
            if len(c) == 0:
                mmax = ""
            else:
                mmax = max(c)

            tmpfile.write("\t".join(map(str,
                                        (motif, id,
                                         nmatches,
                                         score,
                                         ",".join(map(str, scores)),
                                         len(c),
                                         mmax))) + "\n")

    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
              -b sqlite \
              --index=id \
              --index=motif \
              --index=id,motif \
              --table=%(tablename)s \
              --map=base_qualities:text \
    < %(tmpfilename)s > %(outfile)s
    '''

    P.run()
    os.unlink(tmpfile.name)
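The full-match/control split in loadGLAM2SCAN rests entirely on the number of "_"-separated parts in the match id: two parts mean a foreground match, anything longer is a control. A compact sketch with invented ids and scores:

import collections

# hypothetical (id, score) pairs, for illustration only
matches = [
    ("t1_7", 12.0),    # two parts -> full match
    ("t1_7_l", 4.0),   # three parts -> control
    ("t1_7_r", 5.5),
]

full_matches = collections.defaultdict(list)
controls = collections.defaultdict(list)
for name, score in matches:
    m = name.split("_")
    track, id = m[:2]
    if len(m) == 2:
        full_matches[id].append(score)
    else:
        controls[id].append(score)

# max_controls is "" when no control was seen, else the maximum score
c = controls["7"]
mmax = max(c) if c else ""
print(mmax)  # 5.5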
Example #4
def loadGLAM2SCAN(infile, outfile):
    '''parse glam2scan output and load into database.

    Parse several motif runs and add them to the same
    table.
    '''
    tablename = outfile[:-len(".load")]
    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.write(
        "motif\tid\tnmatches\tscore\tscores\tncontrols\tmax_controls\n")

    lines = IOTools.openFile(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    for chunk in range(len(chunks) - 1):

        # use real file, as parser can not deal with a
        # list of lines

        try:
            motif = re.match(r":: motif = (\S+) ::",
                             lines[chunks[chunk]]).groups()[0]
        except AttributeError:
            raise P.PipelineError("parsing error in line '%s'" %
                                  lines[chunks[chunk]])

        if chunks[chunk] + 1 == chunks[chunk + 1]:
            P.warn("no results for motif %s - ignored" % motif)
            continue

        tmpfile2 = tempfile.NamedTemporaryFile(delete=False)
        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()
        glam = Glam2Scan.parse(IOTools.openFile(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        # collect control data
        full_matches = collections.defaultdict(list)
        controls = collections.defaultdict(list)
        for match in glam.matches:
            m = match.id.split("_")
            track, id = m[:2]
            if len(m) == 2:
                full_matches[id].append(match)
            else:
                controls[id].append(match.score)

        for id, matches in full_matches.iteritems():

            nmatches = len(matches)
            scores = [x.score for x in matches]
            score = max(scores)
            # move to genomic coordinates
            #contig, start, end = re.match( "(\S+):(\d+)..(\d+)", match.id).groups()
            #start, end = int(start), int(end)
            #match.start += start
            #match.end += start
            contig = ""

            if id not in controls:
                P.warn("no controls for %s - increase evalue?" % id)

            c = controls[id]
            if len(c) == 0:
                mmax = ""
            else:
                mmax = max(c)

            tmpfile.write("\t".join(
                map(str, (motif, id, nmatches, score,
                          ",".join(map(str, scores)), len(c), mmax))) + "\n")

    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
              -b sqlite \
              --index=id \
              --index=motif \
              --index=id,motif \
              --table=%(tablename)s \
              --map=base_qualities:text \
    < %(tmpfilename)s > %(outfile)s
    '''

    P.run()
    os.unlink(tmpfile.name)
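The `statement` blocks in all four loaders use Python's dict-style "%(name)s" placeholders; in this pipeline framework P.run() appears to interpolate them from the caller's local variables (an assumption based on usage here, not on documented API). The interpolation mechanism itself is plain Python:

# dict-style %-interpolation, as the statement strings above rely on;
# the variable values below are hypothetical
tablename = "mast_results"
tmpfilename = "./tmp_mast.tsv"
outfile = "mast.load"

statement = "csv2db --table=%(tablename)s < %(tmpfilename)s > %(outfile)s"
print(statement % locals())
# csv2db --table=mast_results < ./tmp_mast.tsv > mast.load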
Example #5
def printTracks(infile, outfile):
    P.warn("\n\n\n\nprinting tracks:")
    for track in EXPERIMENTS:
        print "\t"
        print track
Example #6
def printTracks(infile, outfile):
    P.warn("\n\n\n\nprinting tracks:")
    for track in EXPERIMENTS:
        print "\t"
        print track
def buildExpressionTracks(infile, outfiles, map_exp2columns, suffix):
    '''build expression tracks.

    read the analysis from FILENAME_EXPRESSION
    
    .. note::
       The file A589_Data_RMA.csv does NOT always contain the probeset_id
       in the first column; instead it might be the transcript_cluster_id.
       A possible explanation is that if several probesets map to the same
       transcript cluster, the transcript cluster is normalized.

       The sets of cluster_ids and probeset ids are completely non-overlapping.

    Hence, the :term:`cluster_id` will be used.
    '''

    E.info("importing expression data from %s" % infile)

    dbhandle = sqlite3.connect(PARAMS["database"])

    cc = dbhandle.cursor()
    statement = "SELECT DISTINCT probeset, cluster_id, transcript_id FROM probeset2transcript"
    cc.execute(statement)
    map_cluster2transcript, map_probeset2cluster = {}, {}
    for probeset, cluster, transcript_id in cc.fetchall():
        map_probeset2cluster[probeset] = cluster
        map_cluster2transcript[cluster] = transcript_id

    reader = csv.reader(open(infile, "rU"))

    first = True
    # do not delete old files as this function is called several times
    output_files = IOTools.FilePool(output_pattern="exp%s.data", force=False)

    headers = (("Probe Set ID", "cluster_id"),
               ("Gene Symbol", "genesymbol"),
               ("mRna - Description", "description"),
               ('mRNA Accession', 'mrna_id'),
               ('mRNA  Source', 'source'),
               ('mRNA - xhyb', 'xhyb'),
               ('GO Biological Process ID', 'go_biol_id'),
               ('GO Biological Process Term', 'go_biol_term'),
               ('GO Cellular Component ID', 'go_cell_id'),
               ('GO Cellular Component Term', 'go_cell_term'),
               ('GO Molecular Function ID', 'go_mol_id'),
               ('GO Molecular Function Term', 'go_mol_term'),
               ('Pathway Source', 'pw_source'),
               ('Pathway Name', 'pw_name'))

    old_headers = set([x[0] for x in headers])
    new_headers = [x[1] for x in headers]
    take = []
    index_source, index_accession, index_probeset = None, None, None
    counts = E.Counter()
    found = set()

    outf = open(outfiles[0] + suffix, "w")
    outf.write("# %s\n" % infile)
    outs = open(outfiles[1] + suffix, "w")
    outs.write("# %s\n" % infile)

    writer = csv.writer(outf)

    for row in reader:
        if first:
            first = False
            writer.writerow(row)

            for x, old_header in enumerate(row):
                if old_header == "mRNA  Source":
                    index_source = len(take)
                if old_header == "mRNA Accession":
                    index_accession = len(take)
                if old_header == "Probe Set ID":
                    index_probeset = len(take)
                if old_header in old_headers:
                    take.append(x)

            # write headers to all files
            outs.write("\t".join(new_headers) + "\n")

            for exp, columns in map_exp2columns.items():
                output_files.write(
                    exp, "\t".join(
                        ("cluster_id", Stats.Summary().getHeader(), "\t".join(
                            ["R%i" % i for i in range(len(columns))]))) + "\n")
        else:
            new_row = []
            for x in take:
                if row[x].strip() != "---":
                    new_row.append(row[x].strip())
                else:
                    new_row.append("")

            probeset = new_row[index_probeset].strip()
            if probeset in map_probeset2cluster:
                probeset = map_probeset2cluster[probeset]
                counts.mapped_to_cluster += 1

            if probeset not in map_cluster2transcript:
                writer.writerow(row)
                counts.skipped += 1
                continue
            else:
                if probeset in found:
                    counts.duplicates += 1
                counts.output += 1
                found.add(probeset)

            outs.write("\t".join(new_row) + "\n")

            for exp, cols in map_exp2columns.items():
                data = [row[x] for x in cols]
                output_files.write(
                    exp, "\t".join(
                        (probeset, str(Stats.Summary(
                            [float(x)
                             for x in data])), "\t".join(data))) + "\n")

    outf.close()
    if counts.duplicates > 0:
        P.warn("duplicate probeset/clusters")

    P.info("probeset source information: %s" % str(counts))
    output_files.close()
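The column handling in buildExpressionTracks boils down to: match the csv header row against an old->new header map, remember which column indices to keep, then emit renamed headers and cleaned values. A self-contained miniature of that selection logic, on made-up rows:

# old csv header -> new column name, as in the headers tuple above
headers = (("Probe Set ID", "cluster_id"),
           ("Gene Symbol", "genesymbol"))
old_headers = set(x[0] for x in headers)
new_headers = [x[1] for x in headers]

# hypothetical rows; real input comes from FILENAME_EXPRESSION
rows = [["Probe Set ID", "Gene Symbol", "ignored"],
        ["123_at", "TP53", "xxx"],
        ["456_at", "---", "yyy"]]

take = [x for x, h in enumerate(rows[0]) if h in old_headers]
print("\t".join(new_headers))
for row in rows[1:]:
    # "---" is this format's missing-value marker and becomes an empty field
    print("\t".join(row[x].strip() if row[x].strip() != "---" else ""
                    for x in take))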