Code Example #1
File: __init__.py Project: BioXiao/cgat
def dtwWrapper(data, rows, columns, k):
    '''
    wrapper function for dynamic time warping.
    includes use of exponential adaptive tuning function
    with temporal correlation if k > 0
    '''

    # not explicitly called, but needs to be in R environment
    DTW = importr("dtw")

    # create a data frame of zeros of size number of ids x number of ids
    # fill it with the calculated distance metric for each pair wise comparison

    df_ = pd.DataFrame(index=rows,
                       columns=columns)
    df_ = df_.fillna(0.0).astype(np.float64)

    # fill the array with dtw-distance values
    pandas2ri.activate()

    for i in rows:
        E.info("DTW %s" % i)
        for j in columns:
            series1 = data.loc[i].values.tolist()
            series2 = data.loc[j].values.tolist()
            DTW_value = (R.dtw(series1,
                               series2)).rx('distance')[0][0]
            cort_value = temporalCorrelate(series1, series2)
            tuned_value = adaptiveTune(cort_value, k)
            time_dist = DTW_value * tuned_value
            df_.loc[i][j] = float(time_dist)
            df_[j][i] = float(time_dist)

    return df_
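The helpers temporalCorrelate and adaptiveTune are not part of this listing. Below is a minimal sketch of what they might look like, assuming the CORT temporal correlation and the exponential adaptive tuning function that the docstring alludes to; the names and formulas are illustrative and not necessarily the project's actual implementation.

import numpy as np

def temporalCorrelate(series1, series2):
    # CORT: correlation of the first differences of the two series,
    # i.e. how similar their dynamics (ups and downs) are.
    d1 = np.diff(np.asarray(series1, dtype=float))
    d2 = np.diff(np.asarray(series2, dtype=float))
    denom = np.sqrt((d1 * d1).sum()) * np.sqrt((d2 * d2).sum())
    if denom == 0:
        return 0.0
    return float((d1 * d2).sum() / denom)

def adaptiveTune(value, k):
    # exponential tuning function: maps a correlation in [-1, 1] to a
    # weight in (0, 2); k = 0 gives a constant weight of 1, so the
    # DTW distance is left unchanged in that case.
    return float(2.0 / (1.0 + np.exp(k * value)))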
Code Example #2
def buildIndirectMaps(infile, outfile, track):
    '''build a map between query and target, linking
    via intermediate targets.'''

    to_cluster = True

    path = P.asList(PARAMS["%s_path" % track])

    E.info("path=%s" % str(path))

    statement = []

    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            raise ValueError(
                "required file %s for %s (stage %i) not exist." % (filename, outfile, stage))

        if stage == 0:
            statement.append( '''gunzip < %(filename)s''' % locals() )
        else:
            statement.append( '''
               pslMap stdin <(gunzip < %(filename)s) stdout
            ''' % locals() )

    statement.append("gzip")

    statement = " | ".join(statement) + " > %(outfile)s " % locals()

    P.run()
Code Example #3
File: CSV2DB.py Project: CGATOxford/cgat
def executewait(dbhandle, statement, error, retry=False, wait=5, args=()):
    """execute sql statement.

    Retry on error, if retry is True.
    Returns a cursor object.
    """

    cc = dbhandle.cursor()
    i = 20
    while i > 0:
        try:
            cc.execute(statement, args)
            return cc
        except sqlite3.OperationalError as e:
            msg = e.message
            E.warn("import failed: msg=%s, statement=\n  %s" % (msg, statement))
            # TODO: check for database locked msg
            if not retry:
                raise e
            if not re.search("locked", str(msg)):
                raise e
            time.sleep(wait)
            i -= 1
            continue
        break
    raise sqlite3.OperationalError("Database locked and too many retries")
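A minimal usage sketch, assuming the imports used by the listing above (sqlite3 and CGAT's E module) are available; the table and statements are made up for illustration. Note that the error argument is accepted but unused by the function body.

import sqlite3

dbhandle = sqlite3.connect(":memory:")
cc = executewait(dbhandle,
                 "CREATE TABLE IF NOT EXISTS stats (track TEXT, value REAL)",
                 error=None, retry=True, wait=1)
cc = executewait(dbhandle,
                 "INSERT INTO stats VALUES (?, ?)",
                 error=None, retry=True, wait=1, args=("track1", 0.5))
dbhandle.commit()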
Code Example #4
File: __init__.py Project: BioXiao/cgat
def correlateDistanceMetric(data, rows, columns, method, lag=0):
    '''
    wrapper for correlation coefficients as distance metrics
    for time-series clustering.
    Use either temporal correlation (analogous to template matching)
    or normalised cross correlation.
    '''

    # create blank (all 0's) dataframe to fill with correlation values

    df_ = pd.DataFrame(index=rows,
                       columns=columns)
    df_ = df_.fillna(0.0)

    if method == "cross-correlate":
        for i in rows:
            E.info("cross-correlation %s" % i)
            for j in columns:
                series1 = data.loc[i].values.tolist()
                series2 = data.loc[j].values.tolist()
                corr = crossCorrelate(series1, series2, lag=lag)
                df_.loc[i][j] = 1.0 - abs(corr)
                df_[j][i] = 1.0 - abs(corr)

    elif method == "temporal-correlate":
        for i in rows:
            E.info("temporal correlation %s" % i)
            for j in columns:
                series1 = data.loc[i].tolist()
                series2 = data.loc[j].tolist()
                corr = temporalCorrelate(series1, series2)
                df_.loc[i][j] = 1.0 - abs(corr)
                df_[j][i] = 1.0 - abs(corr)

    return df_
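temporalCorrelate is sketched under Code Example #1; crossCorrelate is not shown either. A rough standalone sketch of a normalised cross-correlation at a fixed lag follows, assuming the convention that the coefficient lies in [-1, 1]; the actual CGAT helper may differ in detail.

import numpy as np

def crossCorrelate(series1, series2, lag=0):
    # normalised cross-correlation of two equal-length series at a fixed lag;
    # 0 is returned for flat (zero-variance) series
    x = np.asarray(series1, dtype=float)
    y = np.asarray(series2, dtype=float)
    if x.std() == 0 or y.std() == 0:
        return 0.0
    x = (x - x.mean()) / (x.std() * len(x))
    y = (y - y.mean()) / y.std()
    corr = np.correlate(x, y, mode="full")
    mid = len(corr) // 2  # zero-lag position
    return float(corr[mid + lag])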
Code Example #5
    def extractPairwiseAlignmentSingleFile(infiles, outfile, track):
        '''build pairwise genomic alignment from maf files.'''

        try:
            os.remove(outfile)
        except OSError:
            pass

        genomefile = PARAMS["%s_genome" % track]

        to_cluster = True

        for infile in infiles:

            E.info("adding %s" % infile)

            statement = '''gunzip < %(infile)s
                 | cgat maf2psl
                      --query=%(track)s
                      --target=%(maf_master)s
                      --log=%(outfile)s.log
                 | cgat psl2psl
                      --method=filter-fasta
                      --method=sanitize
                      --queries-tsv-file=%(genomefile)s
                      --target-psl-file=%(genome)s
                      --log=%(outfile)s.log
                 | gzip
                 >> %(outfile)s
                 '''
            P.run()
Code Example #6
File: PipelineMotifs.py Project: lesheng/cgat
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.getTempDir(".")

    to_cluster = True
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
           tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
Code Example #7
File: PipelineMotifs.py Project: lesheng/cgat
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(track,
                                      tmpfasta,
                                      dbhandle,
                                      full=True,
                                      masker="dust",
                                      proportion=PARAMS["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        P.touch(outfile)
    else:
        statement = '''
    BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log
    '''
        P.run()

    os.unlink(tmpfasta)
Code Example #8
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open('dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(
            statfile, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s '''
    P.run()
Code Example #9
File: fasta2bed.py Project: siping/cgat
def segmentWithCpG( infile, options ):
    '''segment a fasta file, output locations of CpG.'''

    ninput, nskipped, noutput = 0, 0, 0
    
    iterator = FastaIterator.FastaIterator( infile )

    segments = []
    
    while 1:
        try:
            cur_record = iterator.next()
        except StopIteration:
            break

        if cur_record is None: break
        ninput += 1
        contig = re.sub("\s.*", "", cur_record.title )
        last = None
        for pos, this in enumerate( cur_record.sequence.upper()):
            if last == "C" and this == "G":
                segments.append( (contig, pos - 1, pos + 1, 1.0))
            last = this

    E.info( "ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput,nskipped) )

    return segments
Code Example #10
File: Counts.py Project: CGATOxford/cgat
def findClusters(df, distance, size, tracks_map, groups):
    '''define clusters of genomic loci depending on thresholds for
    size and minimum members per cluster
    This was written with CpGs in mind but will work with any data frame
    containing "position" and "contig" columns'''

    positions = df['position'].tolist()
    contigs = df['contig'].tolist()
    current_pos = 0
    cluster_ix = []
    current_contig = ""
    cluster_dfs = {}
    n = 0
    for ix in range(0, len(positions)):
        next_pos = positions[ix]
        next_contig = contigs[ix]
        if (((next_pos < current_pos + distance) &
             (next_contig == current_contig))):
            cluster_ix.append(ix)
        else:
            if len(cluster_ix) >= size:
                start, end = (cluster_ix[0], cluster_ix[-1] + 1)
                cluster_dfs[n] = df.iloc[start:end]
                n += 1
            cluster_ix = []
            current_pos = next_pos
            current_contig = next_contig

    E.info("found %i clusters" % n)
    return (cluster_dfs)
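A small usage sketch, assuming the CGAT E logging module used above is importable; the toy positions place a run of five CpGs within 100 bp on chr1 and two isolated ones on chr2, so with distance=100 and size=3 only the chr1 run comes back as a cluster. Note that tracks_map and groups are unused by the body shown, and that the first position of a run seeds the cluster without being included in it, a quirk of the loop above.

import pandas as pd

toy = pd.DataFrame({
    "contig": ["chr1"] * 5 + ["chr2"] * 2,
    "position": [100, 120, 150, 180, 190, 5000, 9000]})

clusters = findClusters(toy, distance=100, size=3,
                        tracks_map=None, groups=None)
# clusters is a dict of cluster index -> sub-DataFrame;
# here clusters[0] holds the rows at positions 120-190 on chr1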
Code Example #11
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)
Code Example #12
def mergeExpectedAndObservedGenomeCoverage(infiles, outfile):
    '''
    merge the expected and actual estimates
    of genome coverage
    '''
 
    expected = open(infiles[0])
    expected_header = expected.readline()
    observed = open(infiles[1])
    observed_header = observed.readline()

    expected_data = {}
    E.info("reading expected coverage over genomes")
    for line in expected.readlines():
        data = line[:-1].split("\t")
        gi, coverage = data[0], data[1]
        expected_data[gi] = coverage

    outf = open(outfile, "w")
    E.info("writing results")
    outf.write("track\tgi\tspecies\tobserved\texpected\n")
    for line in observed.readlines():
        data = line[:-1].split("\t")
        track, gi, species, coverage = data[0], data[1], "_".join(data[2].split("_")[5:7]), data[3]
        outf.write("%s\t%s\t%s\t%s\t%s\n" % (track, gi, species, coverage, expected_data[gi]))
    outf.close()
Code Example #13
File: farm.py Project: gjaime/CGATPipelines
    def __call__(self, filenames, outfile, options):

        for fi, fn in filenames:
            E.debug("# merging %s" % fn)
            infile = IOTools.openFile(fn, "r")

            if options.output_header:
                self.parseHeader(infile, outfile, options)

            for l in infile:
                nfields = l.count("\t")

                if l[0] == "#":
                    options.stdlog.write(l)
                elif self.nfields is not None and nfields != self.nfields:
                    # validate number of fields in row, raise warning
                    # for those not matching and skip.
                    E.warn(
                        "# line %s has unexpected number of fields: %i != %i" %
                        (l[:-1], nfields, self.nfields))
                else:
                    if self.mFieldIndex is not None:
                        data = l[:-1].split("\t")
                        try:
                            data[self.mFieldIndex] = self.mMapper(
                                fi, data[self.mFieldIndex])
                        except IndexError:
                            raise IndexError(
                                "can not find field %i in %s" %
                                (self.mFieldIndex, l))
                        l = "\t".join(data) + "\n"

                    outfile.write(l)
            infile.close()
Code Example #14
File: bed2bed.py Project: lesheng/cgat
def sanitizeGenome(iterator, contigs):
    """truncate bed intervals that extend beyond contigs.

    removes empty intervals (start == end).

    throws an error if start > end.
    """

    ninput, noutput = 0, 0
    ntruncated_contig, nskipped_contig, nskipped_empty = 0, 0, 0

    for bed in iterator:
        ninput += 1
        if bed.contig not in contigs:
            nskipped_contig += 1
            continue
        # IMS: changing >= to > in if statement: next line sets bed.end = contigs[bed.contig]
        # this shouldn't count as a truncation.
        if bed.end > contigs[bed.contig]:
            bed.end = contigs[bed.contig]
            ntruncated_contig += 1
        if bed.start < 0:
            bed.start = 0
            ntruncated_contig += 1
        if bed.start == bed.end:
            nskipped_empty += 1
            continue
        elif bed.start > bed.end:
            raise ValueError("invalid interval: start > end for %s" % str(bed))

        noutput += 1
        yield bed

    E.info("ninput=%i, noutput=%i, nskipped_contig=%i, ntruncated=%i, nskipped_empty=%i" %
           (ninput, noutput, nskipped_contig, ntruncated_contig, nskipped_empty))
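A toy usage sketch, assuming the CGAT E logging module used above is importable; the minimal Bed stand-in below only carries the attributes the generator touches and is not the real CGAT Bed class.

class _Bed(object):
    # minimal stand-in with just the attributes sanitizeGenome uses
    def __init__(self, contig, start, end):
        self.contig, self.start, self.end = contig, start, end

    def __str__(self):
        return "%s\t%i\t%i" % (self.contig, self.start, self.end)

contigs = {"chr1": 1000}
intervals = [_Bed("chr1", 10, 50),     # kept as-is
             _Bed("chr1", 990, 1200),  # end truncated to 1000
             _Bed("chr2", 0, 100),     # skipped, unknown contig
             _Bed("chr1", 20, 20)]     # skipped, empty interval

for bed in sanitizeGenome(iter(intervals), contigs):
    print(bed)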
Code Example #15
File: bed2bed.py Project: lesheng/cgat
def extendInterval(iterator, contigs, distance):

    ninput, noutput = 0, 0
    nskipped_contig, nskipped_range = 0, 0
    for bed in iterator:
        ninput += 1

        if bed.contig not in contigs:
            nskipped_contig += 1
            continue
        if bed.start < 0 or bed.end < 0:
            nskipped_range += 1
            continue
        if bed.end > contigs[bed.contig]:
            nskipped_range += 1
            continue

        newstart = bed.start - distance
        newend = bed.end + distance

        if newstart < 0:
            newstart = 0

        if newend > contigs[bed.contig]:
            newend = contigs[bed.contig]

        bed.start = newstart
        bed.end = newend

        noutput += 1
        yield bed

    E.info("ninput = %i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))
Code Example #16
File: combine_tables.py Project: yangjl/cgat
def readTable( filename, options):
    '''read table and filter.
    '''

    if os.path.exists(filename):
        lines = IOTools.openFile(filename, "r").readlines()
    else: 
        lines = []

    # extract table by regular expression
    if options.regex_start:
        rx = re.compile(options.regex_start)
        for n, line in enumerate(lines):
            if rx.search(line): 
                E.info("reading table from line %i/%i" % (n,len(lines)))
                lines = lines[n:]
                break
        else:
            E.info("start regex not found - no table")
            lines = []

    if options.regex_end:
        rx = re.compile(options.regex_end)
        for n, line in enumerate(lines):
            if rx.search(line): break
        lines = lines[:n]

    # remove comments and empty lines
    lines = [ x for x in lines if not x.startswith("#") and x.strip()]

    return lines
Code Example #17
File: bed2bed.py Project: lesheng/cgat
def filterGenome(iterator, contigs):
    """remove bed intervals that are outside of contigs.

    contigs is a dictionary of contig sizes."""

    ninput, noutput = 0, 0
    nskipped_contig, nskipped_range, nskipped_endzero = 0, 0, 0

    for bed in iterator:
        ninput += 1
        if bed.contig not in contigs:
            nskipped_contig += 1
            continue
        # IMS: add filtering for filtering <0 co-ordinates
        if bed.start < 0 or bed.end < 0:
            nskipped_range += 1
            continue
        # should this not be just >, as co-ordinates are half-closed, so
        # if end = contigs[bed.contig], then interval ends on last base?
        if bed.end > contigs[bed.contig]:
            nskipped_range += 1
            continue
        if bed.end == 0:
            nskipped_endzero += 1
            continue
        noutput += 1
        yield bed

    E.info("ninput=%i, noutput=%i, nskipped_contig=%i, nskipped_range=%i, nskipped_endzero=%i" %
           (ninput, noutput, nskipped_contig, nskipped_range, nskipped_endzero))
Code Example #18
File: PipelineUCSC.py Project: sudlab/CGATPipelines
def getCpGIslandsFromUCSC(dbhandle, outfile):
    '''get CpG islands from UCSC database and save as a :term:`bed`
    formatted file.

    The name column in the bed file will be set to the UCSC name.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    outfile : string
       Filename of output file in :term:`bed` format.
    '''

    cc = dbhandle.cursor()
    table = "cpgIslandExt"
    sql = """SELECT chrom, chromStart, chromEnd, name
    FROM %(table)s ORDER by chrom, chromStart"""
    sql = sql % locals()

    E.debug("executing sql statement: %s" % sql)
    try:
        cc.execute(sql)
        outfile = IOTools.openFile(outfile, "w")
        for data in cc.fetchall():
            outfile.write("\t".join(map(str, data)) + "\n")
        outfile.close()
    except Exception:
        E.warn("Failed to connect to table %s. %s is empty" % (table, outfile))
        P.touch(outfile)
Code Example #19
File: fasta2bed.py Project: Charlie-George/cgat
def segmentWithCpG(infile, with_contig_sizes=False):
    '''segment a fasta file, output locations of CpG.'''

    ninput, nskipped, noutput = 0, 0, 0

    iterator = FastaIterator.FastaIterator(infile)

    segments, contig_sizes = [], collections.OrderedDict()

    for cur_record in iterator:
        ninput += 1
        contig = re.sub("\s.*", "", cur_record.title)
        last = None
        contig_sizes[contig] = (0, len(cur_record.sequence))
        for pos, this in enumerate(cur_record.sequence.upper()):
            if last == "C" and this == "G":
                segments.append((contig, pos - 1, pos + 1, 1.0))
            last = this

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    if with_contig_sizes:
        return segments, contig_sizes

    return segments
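A standalone sketch of the scanning logic on a plain string (independent of FastaIterator), showing that each CpG dinucleotide is reported as a zero-based, half-open two-base interval; the toy sequence and contig name are made up.

sequence = "ACGTTACGGCG"
segments = []
last = None
for pos, this in enumerate(sequence.upper()):
    if last == "C" and this == "G":
        # interval covers the C and the G: [pos - 1, pos + 1)
        segments.append(("toy_contig", pos - 1, pos + 1, 1.0))
    last = this
print(segments)
# [('toy_contig', 1, 3, 1.0), ('toy_contig', 6, 8, 1.0), ('toy_contig', 9, 11, 1.0)]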
Code Example #20
    def buildRawGenomeAlignment(infiles, outfile):
        '''build pairwise genomic alignment from maf files.
        '''

        try:
            os.remove(outfile)
        except OSError:
            pass

        for infile in infiles:
            # skip maf files without Hsap on top.
            if "other" in infile or "supercontig" in infile:
                continue

            E.info("adding %s" % infile)

            genome_query, genome_target = getGenomes()

            statement = '''gunzip < %(infile)s 
             | python %(scriptsdir)s/maf2psl.py
                  --query=%(maf_name_query)s
                  --target=%(maf_name_target)s
                  --log=%(outfile)s.log
             | python %(scriptsdir)s/psl2psl.py
                  --method=filter-fasta
                  --method=sanitize
                  --queries-tsv-file=%(genome_query)s
                  --target-psl-file=%(genome_target)s
                  --log=%(outfile)s.log
             | gzip
             >> %(outfile)s
             '''
            P.run()
Code Example #21
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''gather BAM file alignment statistics using Picard '''

    job_options = getPicardOptions()
    job_threads = 3

    if getNumReadsFromBAMFile(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # Picard seems to have problem if quality information is missing
    # or there is no sequence/quality information within the bam file.
    # Thus, add it explicitely.
    statement = '''cat %(infile)s
    | python %(scriptsdir)s/bam2bam.py -v 0
    --method=set-sequence --output-sam
    | CollectMultipleMetrics
    INPUT=/dev/stdin
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''

    P.run()
Code Example #22
File: farm.py Project: jmadzo/cgat
    def parseHeader(self, infile, outfile, options):
        """parse header in infile."""
        # skip comments until header
        while 1:
            l = infile.readline()
            if not l or l[0] != "#":
                break
            options.stdlog.write(l)

        # Print only the first header and check if
        # all the headers are the same.
        if self.mHeader:
            if self.mHeader != l:
                raise ValueError(
                    "inconsistent header in file %s\ngot=%s\nexpected=%s" % (infile, l, self.mHeader))
        else:
            outfile.write(l)
            self.mHeader = l

            if self.mFieldIndex is None and self.mFieldName:
                try:
                    self.mFieldIndex = self.mHeader.split(
                        "\t").index(self.mFieldName)
                except ValueError:
                    E.warn("no mapping, can not find field %s in %s" %
                           (self.mFieldName, self.mHeader))
                    self.mFieldName = None

                E.debug(
                    "substituting field: %s, %s" %
                    (self.mFieldName, self.mFieldIndex))
Code Example #23
File: diff_gtf.py Project: SCV/cgat
    def count(self, filename1, filename2):
        """count overlap between two gtf files."""

        E.info("counting started for %s versus %s" % (filename1, filename2))

        idx2 = self.buildIndex(filename2)

        (self.mGenes1, self.mGenesOverlapping1,
         self.mExons1, self.mExonsOverlapping1,
         self.mBases1, self.mBasesOverlapping1 ) = \
            self._count(filename1, idx2)

        self.mGenesUnique1 = self.mGenes1 - self.mGenesOverlapping1
        self.mExonsUnique1 = self.mExons1 - self.mExonsOverlapping1
        self.mBasesUnique1 = self.mBases1 - self.mBasesOverlapping1

        idx1 = self.buildIndex(filename1)

        (self.mGenes2, self.mGenesOverlapping2,
         self.mExons2, self.mExonsOverlapping2,
         self.mBases2, self.mBasesOverlapping2 ) = \
            self._count(filename2, idx1)

        self.mGenesUnique2 = self.mGenes2 - self.mGenesOverlapping2
        self.mExonsUnique2 = self.mExons2 - self.mExonsOverlapping2
        self.mBasesUnique2 = self.mBases2 - self.mBasesOverlapping2
Code Example #24
def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf file
    using axtToMaf, filters maf alignments under a specified length.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_annotations_dir"],
                                  PARAMS_ANNOTATIONS["interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS_ANNOTATIONS["interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate axt files, then remove headers
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)
    to_cluster = False
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()
Code Example #25
def extractLncRNAFastaAlignments(infiles, outfile):
    """
    Receives a MAF file containing pairwise alignments and a gtf12 file
    containing intervals. Outputs a single fasta file containing aligned
    sequence for each interval.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    os.unlink(maf_tmp)
Code Example #26
File: psl2psl.py Project: Q-KIM/cgat
def pslSelectQuery(options):

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    value, field = options.select.split("-")

    if field == "nmatches":
        f = lambda x: x.mNMatches
    elif field == "nmismatches":
        f = lambda x: x.mNMisMatches

    for data in Blat.iterator_per_query(Blat.iterator(options.stdin)):

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        data.sort(key=f)

        if value == "most":
            options.stdout.write("%s\n" % str(data[-1]))
        elif value == "least":
            options.stdout.write("%s\n" % str(data[0]))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
Code Example #27
File: psl2psl.py Project: Q-KIM/cgat
def iterator_rename_query(infile, options):

    ninput, noutput, nerrors = 0, 0, 0

    map_old2new = {}
    x = 1
    while 1:

        match = infile.next()

        if not match:
            break
        ninput += 1

        if match.mQueryId not in map_old2new or options.unique:
            new = options.id_format % x
            map_old2new[match.mQueryId] = new
            x += 1
        else:
            new = map_old2new[match.mQueryId]

        match.mQueryId = new
        noutput += 1
        yield match

    if options.output_filename_map:
        outfile = open(options.output_filename_map, "w")
        outfile.write("%s\t%s\n" % ("old", "new"))
        for old, new in map_old2new.iteritems():
            outfile.write("%s\t%s\n" % (old, new))
        outfile.close()

    E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors))
Code Example #28
File: Local.py Project: sudlab/CGATPipelines
def _copy(src, dest):
    if os.path.exists(dest):
        shutil.rmtree(dest)
    if not os.path.exists(src):
        E.warn("%s does not exist - skipped" % src)
        return
    shutil.copytree(os.path.abspath(src), dest)
Code Example #29
File: psl2psl.py Project: Q-KIM/cgat
def pslAddSequence(query_fasta, sbjct_fasta, options):

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    while 1:

        match = iterator.next()
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        new = Blat.MatchPSLX()
        new.fromPSL(match,
                    query_fasta.getSequence(
                        match.mQueryId, "+", match.mQueryFrom, match.mQueryTo),
                    sbjct_fasta.getSequence(
                        match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo))

        options.stdout.write(str(new) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
Code Example #30
File: farm.py Project: jmadzo/cgat
    def __call__(self, filenames, outfile, options):

        for fi, fn in filenames:
            E.debug("# merging %s" % fn)
            infile = IOTools.openFile(fn, "r")

            if options.output_header:
                self.parseHeader(infile, outfile, options)

            for l in infile:
                if l[0] == "#":
                    options.stdlog.write(l)
                else:
                    if self.mFieldIndex is not None:
                        data = l[:-1].split("\t")
                        try:
                            data[self.mFieldIndex] = self.mMapper(
                                fi, data[self.mFieldIndex])
                        except IndexError:
                            raise IndexError(
                                "can not find field %i in %s" %
                                (self.mFieldIndex, l))
                        l = "\t".join(data) + "\n"

                    outfile.write(l)
            infile.close()
Code Example #31
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    psl = None

    def chain_iterator(infile):
        lines = []
        for line in infile:

            if line.startswith("#"):
                continue
            if line.strip() == "":
                continue
            if line.startswith("chain"):
                if lines:
                    yield lines
                lines = []
            lines.append(line)

        yield lines

    for lines in chain_iterator(options.stdin):

        ninput += 1
        psl = Blat.Match()

        (_,
         _,
         psl.mSbjctId,
         target_length,
         target_strand,
         target_start,
         target_end,
         psl.mQueryId,
         query_length,
         query_strand,
         query_start,
         query_end,
         alignment_id) = lines[0][:-1].split()

        (psl.mQueryStart, psl.mQueryEnd, psl.mQueryLength,
         psl.mSbjctStart, psl.mSbjctEnd, psl.mSbjctLength) = \
            [int(x) for x in
             (query_start,
              query_end,
              query_length,
              target_start,
              target_end,
              target_length)]

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        qstart, tstart = psl.mQueryStart, psl.mSbjctStart

        for line in lines[1:-1]:
            size, dt, dq = [int(x) for x in line[:-1].split()]
            map_query2target.addDiagonal(qstart,
                                         qstart + size,
                                         tstart - qstart)
            qstart += size + dq
            tstart += size + dt

        size = int(lines[-1][:-1])

        map_query2target.addDiagonal(qstart,
                                     qstart + size,
                                     tstart - qstart)

        psl.fromMap(map_query2target)

        # sort out strand
        # target_strand is always positive
        assert(target_strand == "+")

        # if query strand is negative
        if query_strand == "-":
            # invert both query and target
            psl.switchTargetStrand()
            # manually invert the query coordinates
            psl.mQueryFrom, psl.mQueryTo = psl.mQueryLength - \
                psl.mQueryTo, psl.mQueryLength - psl.mQueryFrom

        options.stdout.write("%s\n" % psl)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
Code Example #32
def iterate_guess(infile, max_tries=10000, guess=None):
    '''iterate over contents of fastq file.

    Guess quality format by looking at the first `max_tries` entries and
    then subsequently setting the quality score format for each entry.

    Arguments
    ---------
    infile : File
       File or file-like object to iterate over
    max_tries : int
       Number of records to examine for guessing the quality score
       format.
    guess : string
       Default format. This format will be chosen if the quality
       score format is ambiguous. The method checks if the `guess`
       is compatible with the records read so far.

    Yields
    ------
    fastq
        An object of type :class:`Record`.

    Raises
    ------
    ValueError
        If the ranges of the fastq records are not compatible,
        are incompatible with guess or are ambiguous.

    '''
    quals = set(RANGES.keys())
    cache = []
    myiter = iterate(infile)
    lengths = []
    for c, record in enumerate(myiter):
        quals.intersection_update(set(record.guessFormat()))
        if len(quals) == 0:
            raise ValueError("could not guess format - ranges incompatible.")
        if len(quals) == 1:
            break
        cache.append(record)
        lengths.append(len(record.seq))
        if c > max_tries:
            break

    if len(quals) == 1:
        ref_format = list(quals)[0]
    elif guess in quals:
        E.warn("multiple input formats possible: %s. Continuing with %s" %
               (", ".join(quals), guess))
        ref_format = guess
    elif quals.issubset(set(["solexa", "phred64"])):
        # guessFormat will call phred64 reads as phred64 AND solexa
        # if both still remain after max_tries, assume phred64
        ref_format = "phred64"
    else:
        raise ValueError("could not guess format - could be one of %s." %
                         str(quals))

    for r in cache:
        r.format = ref_format
        yield r

    for r in myiter:
        r.format = ref_format
        yield r
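The quality-range logic lives in RANGES and record.guessFormat(), which are not shown here. As a rough standalone illustration of the idea, the ASCII code ranges commonly quoted for the main fastq encodings are approximately those below; the exact values and keys used by the CGAT Fastq module may differ.

# approximate ASCII code ranges of quality characters (illustrative only)
RANGES_SKETCH = {
    "sanger": (33, 75),     # phred+33
    "solexa": (59, 105),    # solexa+64
    "phred64": (64, 105),   # phred+64
}

def guess_formats(quality_string):
    # return the set of encodings whose range covers every quality character
    codes = [ord(c) for c in quality_string]
    lo, hi = min(codes), max(codes)
    return set(name for name, (rmin, rmax) in RANGES_SKETCH.items()
               if rmin <= lo and hi < rmax)

print(guess_formats("!''*((((***+))%%%++)(%%%%)"))   # {'sanger'}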
Code Example #33
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--source", dest="source_directory",
                      type="string", default=False,
                      help="The directory in which data"
                      "files are held [%default]")

    parser.add_option("-d", "--dest", dest="dest_directory",
                      type="string", default=False,
                      help="The directory in which links"
                      "are created [%default]")

    parser.set_defaults(source_directory=None,
                        dest_directory=".")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read a map of input files to links with sanity checks
    map_filename2link = {}
    links = set()
    for line in options.stdin:
        if line.startswith("#"):
            continue

        # ignore header
        if line.startswith("source"):
            continue

        filename, link = line[:-1].split()[:2]
        if filename in map_filename2link:
            raise ValueError("duplicate filename '%s' " % filename)
        if link in links:
            raise ValueError("duplicate link '%s' " % link)
        map_filename2link[filename] = link
        links.add(link)

    counter = E.Counter()
    counter.input = len(map_filename2link)

    def _createLink(src, dest, counter):
        src = os.path.abspath(src)
        dest = os.path.abspath(os.path.join(options.dest_directory, dest))
        if os.path.exists(dest):
            E.warn("existing symlink %s" % dest)
            counter.link_exists += 1
        elif not os.path.exists(src):
            counter.file_not_found += 1
            E.warn("did not find %s" % src)
        else:
            try:
                os.symlink(src, dest)
                counter.success += 1
            except OSError:
                pass

    if not options.source_directory:
        # no source directory given, filenames must have complete path
        for filename, link in map_filename2link.items():
            _createLink(filename, link, counter)
    else:
        # walk through directory hierarchy and create links
        # for files matching filenames in map_filename2link
        found = set()
        for dirName, subdirList, fileList in os.walk(options.source_directory):
            for f in fileList:
                if f in map_filename2link:
                    if f in found:
                        E.warn("found multiple files with "
                               "the same name %s" % f)
                    else:
                        _createLink(os.path.join(dirName, f),
                                    map_filename2link[f], counter)
                        found.add(f)
                else:
                    E.info("Filename %s not in map" % f)

        notfound = set(map_filename2link.keys()).difference(found)
        counter.notfound = len(notfound)
        if notfound:
            E.warn("did not find %i files: %s" % (len(notfound),
                                                  str(notfound)))

    E.info(counter)
    # write footer and output benchmark information
    E.Stop()
Code Example #34
def publish():
    '''publish report and data.'''

    E.info("publishing report")
    P.publish_report()
Code Example #35
def update_report():
    '''update report.'''

    E.info("updating report")
    P.run_report(clean=False)
Code Example #36
def build_report():
    '''build report from scratch.'''

    E.info("starting report build process from scratch")
    P.run_report(clean=True)
Code Example #37
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=("translate", "translate-to-stop", "truncate-at-stop",
                 "back-translate", "mark-codons", "apply-map", "build-map",
                 "pseudo-codons", "filter", "interleaved-codons", "map-codons",
                 "remove-gaps", "mask-seg", "mask-bias", "mask-codons",
                 "mask-incomplete-codons", "mask-stops", "mask-soft",
                 "remove-stops", "upper", "lower", "reverse-complement",
                 "sample", "shuffle"),
        help="method to apply to sequences.")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="parameter stack for methods that require one "
                      "[default=%default].")

    parser.add_option("-x",
                      "--ignore-errors",
                      dest="ignore_errors",
                      action="store_true",
                      help="ignore errors [default = %default].")

    parser.add_option("--sample-proportion",
                      dest="sample_proportion",
                      type="float",
                      help="sample proportion [default = %default].")

    parser.add_option("--exclude-pattern",
                      dest="exclude_pattern",
                      type="string",
                      help="exclude all sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--include-pattern",
                      dest="include_pattern",
                      type="string",
                      help="include only sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--filter-method",
                      dest="filter_methods",
                      type="string",
                      action="append",
                      help="filtering methods to apply "
                      "[default = %default].")

    parser.add_option(
        "-t",
        "--sequence-type",
        dest="type",
        type="choice",
        choices=("aa", "na"),
        help="sequence type (aa or na) [%default]. This option determines "
        "which characters to use for masking [default = %default].")

    parser.add_option(
        "-l",
        "--template-identifier",
        dest="template_identifier",
        type="string",
        help="template for numerical identifier [default = %default] "
        "for the operation --build-map. A %i is replaced by the position "
        "of the sequence in the file.")

    parser.set_defaults(
        methods=[],
        parameters="",
        type="na",
        aa_mask_chars="xX",
        aa_mask_char="x",
        na_mask_chars="nN",
        na_mask_char="n",
        gap_chars="-.",
        gap_char="-",
        template_identifier="ID%06i",
        ignore_errors=False,
        exclude_pattern=None,
        include_pattern=None,
        sample_proportion=None,
        filter_methods=[],
    )

    (options, args) = E.Start(parser)
    options.parameters = options.parameters.split(",")

    rx_include, rx_exclude = None, None
    if options.include_pattern:
        rx_include = re.compile(options.include_pattern)
    if options.exclude_pattern:
        rx_exclude = re.compile(options.exclude_pattern)

    iterator = FastaIterator.FastaIterator(options.stdin)

    nseq = 0

    map_seq2nid = {}

    if "apply-map" in options.methods:
        map_seq2nid = IOTools.ReadMap(open(options.parameters[0], "r"))
        del options.parameters[0]

    if options.type == "na":
        mask_chars = options.na_mask_chars
        mask_char = options.na_mask_char
    else:
        mask_chars = options.aa_mask_chars
        mask_char = options.aa_mask_char

    if "map-codons" in options.methods:
        map_codon2code = IOTools.ReadMap(open(options.parameters[0], "r"))
        del options.parameters[0]

    if "mask-soft" in options.methods:
        f = options.parameters[0]
        del options.parameters[0]
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in options.methods or "back-translate" in options.methods:

        # open a second stream to read sequences from
        f = options.parameters[0]
        del options.parameters[0]

        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    ninput, noutput, nerrors, nskipped = 0, 0, 0, 0

    if "sample" in options.methods:
        if not options.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = options.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for f in options.filter_methods:
        if f.startswith("min-length"):
            filter_min_sequence_length = int(f.split("=")[1])
        elif f.startswith("max-length"):
            filter_max_sequence_length = int(f.split("=")[1])
        elif f.startswith("id-file"):
            filter_id_list = [
                line[:-1] for line in IOTools.openFile(f.split("=")[1])
            ]

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by
        3'''

        if l % 3 != 0:
            raise ValueError("length of sequence %s not divisible by 3" %
                             (title))

    while 1:
        try:
            cur_record = iterator.next()
        except StopIteration:
            break

        if cur_record is None:
            break
        nseq += 1
        ninput += 1

        sequence = re.sub(" ", "", cur_record.sequence)
        l = len(sequence)

        if rx_include and not rx_include.search(cur_record.title):
            nskipped += 1
            continue

        if rx_exclude and rx_exclude.search(cur_record.title):
            nskipped += 1
            continue

        if sample_proportion:
            if random.random() > sample_proportion:
                continue

        if not (filter_id_list is None or cur_record.title in filter_id_list):
            nskipped += 1
            continue

        for method in options.methods:

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                ls = len(re.sub('[%s]' % options.gap_chars, "", sequence))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        cur_record.title, ls)
                    nerrors += 1
                    if options.ignore_errors:
                        E.warn(msg)
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = other_iterator.next()
                except StopIteration:
                    raise "run out of sequences."

                if cur_record.title != other_record.title:
                    raise "sequence titles don't match: %s %s" % (
                        cur_record.title, other_record.title)

                other_sequence = re.sub("[ %s]" % options.gap_chars, "",
                                        other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % options.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                x = 0
                for aa in sequence:
                    if aa in options.gap_chars:
                        c = options.gap_char * 3
                    else:
                        c = other_sequence[x:x + 3]
                        x += 3
                    seq.append(c)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "   ".join(seq)

            elif method == "reverse-complement":
                sequence = string.translate(
                    sequence, string.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                c = []
                codon = []
                new_sequence = []

                if method == "mask-stops":
                    char = options.na_mask_char
                elif method == "remove-stops":
                    char = options.gap_char

                for x in sequence:

                    if x not in options.gap_chars:
                        codon.append(x.upper())

                    c.append(x)

                    if len(codon) == 3:
                        codon = "".join(codon).upper()
                        # mask all non-gaps
                        if Genomics.IsStopCodon(codon):

                            for x in c:
                                if x in options.gap_chars:
                                    new_sequence.append(x)
                                else:
                                    new_sequence.append(char)
                        else:
                            new_sequence += c

                        c = []
                        codon = []

                new_sequence += c

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # Get next hard masked record and extract sequence and length
                try:
                    cur_hm_record = hard_masked_iterator.next()
                except StopIteration:
                    break
                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)
                new_sequence = []

                # Check lengths of unmasked and soft masked sequences the same
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" % (cur_record.title))

                # Check if hard masked seq contains repeat (N), if so replace N
                # with lowercase sequence from unmasked version
                if sequence == hm_sequence:
                    pass
                else:
                    for x, y in itertools.izip_longest(sequence, hm_sequence):
                        if y == "N":
                            new_sequence += x.lower()
                        else:
                            new_sequence += x.upper()
                sequence = "".join(new_sequence)

            elif method == "map-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in xrange(0, l, 3)):

                    if codon not in map_codon2code:
                        aa = "X"
                    else:
                        aa = map_codon2code[codon]
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "remove-gaps":

                seq = []
                for s in sequence:
                    if s in options.gap_chars:
                        continue
                    seq.append(s)

                sequence = "".join(seq)

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                sequence = " ".join(
                    [sequence[x:x + 3] for x in range(0, l, 3)])

            elif method == "apply-map":
                id = re.match("^(\S+)", cur_record.title).groups()[0]
                if id in map_seq2nid:
                    rest = cur_record.title[len(id):]
                    cur_record.title = map_seq2nid[id] + rest

            elif method == "build-map":
                # build a map of identifiers
                id = re.match("^(\S+)", cur_record.title).groups()[0]
                new_id = options.template_identifier % nseq
                if id in map_seq2nid:
                    raise "duplicate fasta entries - can't map those: %s" % id
                map_seq2nid[id] = new_id
                cur_record.title = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for x in range(0, l, 3):
                    nm = len(filter(lambda x: x in mask_chars, seq[x:x + 3]))
                    if 0 < nm < 3:
                        seq[x:x + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                other_record = other_iterator.next()

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if cur_record.title != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (cur_record.title, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths %i - %i"
                        % (cur_record.title, len(other_sequence) * 3,
                           len(sequence)))

                seq = list(sequence)
                c = 0
                for x in other_sequence:
                    if x in options.aa_mask_chars:
                        if x.isupper():
                            seq[c:c + 3] = [options.na_mask_char.upper()] * 3
                        else:
                            seq[c:c + 3] = [options.na_mask_char.lower()] * 3
                    c += 3

                sequence = "".join(seq)

        l = len(sequence)
        if filter_min_sequence_length is not None and \
           l < filter_min_sequence_length:
            nskipped += 1
            continue

        if filter_max_sequence_length is not None and \
           l > filter_max_sequence_length:
            nskipped += 1
            continue

        options.stdout.write(">%s\n%s\n" % (cur_record.title, sequence))
        noutput += 1

    if "build-map" in options.methods:
        p = options.parameters[0]
        if p:
            outfile = open(p, "w")
        else:
            outfile = options.stdout

        outfile.write("old\tnew\n")
        for old_id, new_id in map_seq2nid.items():
            outfile.write("%s\t%s\n" % (old_id, new_id))
        if p:
            outfile.close()

    E.info("ninput=%i, noutput=%i, nskipped=%i, nerrors=%i" %
           (ninput, noutput, nskipped, nerrors))

    E.Stop()
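A minimal, self-contained sketch of the "mask-incomplete-codons" step above, for illustration only; mask_chars and mask_char are assumed to behave like the script's options (e.g. "nN" and "n"):

def mask_incomplete_codons(sequence, mask_chars="nN", mask_char="n"):
    # Replace any codon that is only partially masked by a fully masked codon.
    seq = list(sequence)
    for x in range(0, len(seq), 3):
        nmasked = len([c for c in seq[x:x + 3] if c in mask_chars])
        if 0 < nmasked < 3:
            seq[x:x + 3] = [mask_char] * len(seq[x:x + 3])
    return "".join(seq)

# mask_incomplete_codons("ATGnTGAAA") -> "ATGnnnAAA"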
コード例 #38
0
ファイル: matrix2matrix.py プロジェクト: wangdi2014/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: matrix2matrix.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=(
                          "normalize-by-min-diagonal",
                          "normalize-by-column",
                          "log",
                          "ln",
                          "negzero2value",
                          "set-diagonal",
                          "subtract-matrix",
                          "mix-matrix",
                          "normalize-by-matrix",
                          "normalize-by-column-max",
                          "normalize-by-row-max",
                          "normalize-by-column-min",
                          "normalize-by-row-min",
                          "normalize-by-column-median",
                          "normalize-by-row-median",
                          "normalize-by-column-mean",
                          "normalize-by-row-mean",
                          "normalize-by-column-total",
                          "normalize-by-row-total",
                          "correspondence-analysis",
                          "normalize-by-value",
                          "add-value",
                          "sort-rows",
                          "sort-columns",
                          "transpose",
                          "upper-bound",
                          "lower-bound",
                          "subtract-first-col",
                          "multiply-by-value",
                          "divide-by-value",
                          "mask-rows",
                          "mask-columns",
                          "mask-rows-and-columns",
                          "symmetrize-mean",
                          "symmetrize-max",
                          "symmetrize-min",
                      ),
                      help="""method to use [default=%default]""")

    parser.add_option("-s",
                      "--scale",
                      dest="scale",
                      type="float",
                      help="factor to scale matrix by [default=%default].")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output number format [default=%default].")

    parser.add_option("--rows-tsv-file",
                      dest="filename_rows",
                      type="string",
                      help="filename with rows to mask [default=%default].")

    parser.add_option("--columns-tsv-file",
                      dest="filename_columns",
                      type="string",
                      help="filename with columns to mask [default=%default].")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="Parameters for various functions.")

    parser.add_option("-t",
                      "--header-names",
                      dest="headers",
                      action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--no-headers",
                      dest="headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("-a",
                      "--value",
                      dest="value",
                      type="float",
                      help="value to use for various algorithms.")

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""input format for matrix.""")

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""output format for matrix.""")

    parser.add_option(
        "--missing-value",
        dest="missing",
        type="float",
        help=
        "value to use for missing values. If not set, missing values will cause the script to fail [default=%default]."
    )

    parser.set_defaults(
        methods=[],
        scale=1.0,
        headers=True,
        format="%6.4f",
        output_format="full",
        input_format="full",
        value=0.0,
        parameters="",
        write_separators=True,
        filename_rows=None,
        filename_columns=None,
        missing=None,
    )

    (options, args) = E.Start(parser)

    options.parameters = options.parameters.split(",")

    lines = [x for x in sys.stdin.readlines() if x[0] != "#"]

    if len(lines) == 0:
        raise IOError("no input")

    chunks = [x for x in range(len(lines)) if lines[x][0] == ">"]

    if not chunks:
        options.write_separators = False
        chunks = [-1]

    chunks.append(len(lines))

    if options.filename_rows:
        row_names, n = IOTools.ReadList(open(options.filename_rows, "r"))
    if options.filename_columns:
        column_names, n = IOTools.ReadList(open(options.filename_columns, "r"))

    for chunk in range(len(chunks) - 1):

        try:
            raw_matrix, row_headers, col_headers = MatlabTools.readMatrix(
                StringIO("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]])),
                format=options.input_format,
                headers=options.headers,
                missing=options.missing)
        except ValueError as msg:
            E.warn("matrix could not be read: %s" % msg)
            continue

        nrows, ncols = raw_matrix.shape

        E.debug("read matrix: %i x %i, %i row titles, %i colum titles" %
                (nrows, ncols, len(row_headers), len(col_headers)))

        parameter = 0

        for method in options.methods:

            matrix = numpy.reshape(numpy.array(raw_matrix), raw_matrix.shape)

            if method in ("normalize-by-matrix", "subtract-matrix",
                          "mix-matrix", "add-matrix"):

                other_matrix, other_row_headers, other_col_headers = MatlabTools.ReadMatrix(
                    open(options.parameters[parameter], "r"),
                    headers=options.headers)

                other_nrows, other_ncols = other_matrix.shape

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# read second matrix from %s: %i x %i, %i row titles, %i colum titles.\n"
                        % (options.parameters[parameter], other_nrows,
                           other_ncols, len(other_row_headers),
                           len(other_col_headers)))

                parameter += 1

            elif method == "normalize-by-min-diagonal":
                for x in range(nrows):
                    for y in range(ncols):
                        m = min(raw_matrix[x, x], raw_matrix[y, y])
                        if m > 0:
                            matrix[x, y] = raw_matrix[x, y] / m

            elif method == "normalize-by-column":
                if nrows != ncols:
                    raise ValueError("only supported for symmeric matrices")

                for x in range(nrows):
                    for y in range(ncols):
                        if raw_matrix[y, y] > 0:
                            matrix[x, y] = raw_matrix[x, y] / raw_matrix[y, y]

            elif method == "normalize-by-value":
                matrix = raw_matrix / float(options.parameters[parameter])
                parameter += 1

            elif method == "normalize-by-row":
                if nrows != ncols:
                    raise ValueError("only supported for symmeric matrices")

                for x in range(nrows):
                    for y in range(ncols):
                        if raw_matrix[y, y] > 0:
                            matrix[x, y] = raw_matrix[x, y] / raw_matrix[x, x]

            elif method == "subtract-first-col":
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y] -= raw_matrix[x, 0]

            elif method.startswith("normalize-by-column"):
                if method.endswith("max"):
                    f = max
                elif method.endswith("min"):
                    f = min
                elif method.endswith("median"):
                    f = scipy.median
                elif method.endswith("mean"):
                    f = scipy.mean
                elif method.endswith("total"):
                    f = sum

                for y in range(ncols):
                    m = f(matrix[:, y])
                    if m != 0:
                        for x in range(nrows):
                            matrix[x, y] = matrix[x, y] / m

            elif method.startswith("normalize-by-row"):
                if method.endswith("max"):
                    f = max
                elif method.endswith("min"):
                    f = min
                elif method.endswith("median"):
                    f = scipy.median
                elif method.endswith("mean"):
                    f = scipy.mean
                elif method.endswith("total"):
                    f = sum

                for x in range(nrows):
                    m = f(matrix[x, :])
                    if m != 0:
                        for y in range(ncols):
                            matrix[x, y] = raw_matrix[x, y] / m

            elif method == "negzero2value":
                # set zero/negative values to a value
                for x in range(nrows):
                    for y in range(ncols):
                        if matrix[x, y] <= 0:
                            matrix[x, y] = options.value

            elif method == "minmax":
                # symmetrize by putting the min/max of each symmetric pair
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y], matrix[y, x] = \
                            min(matrix[x, y], matrix[y, x]), \
                            max(matrix[x, y], matrix[y, x])

            elif method == "log":
                # apply log10 to all positive values.
                for x in range(nrows):
                    for y in range(ncols):
                        if matrix[x, y] > 0:
                            matrix[x, y] = math.log10(matrix[x, y])

            elif method == "ln":
                for x in range(nrows):
                    for y in range(ncols):
                        if matrix[x, y] > 0:
                            matrix[x, y] = math.log(matrix[x, y])

            elif method == "transpose":
                matrix = numpy.transpose(matrix)
                row_headers, col_headers = col_headers, row_headers
                nrows, ncols = ncols, nrows

            elif method == "mul":
                matrix = numpy.dot(matrix, numpy.transpose(matrix))
                col_headers = row_headers

            elif method == "multiply-by-value":
                matrix *= options.value

            elif method == "divide-by-value":
                matrix /= options.value

            elif method == "add-value":
                matrix += options.value

            elif method == "angle":
                # write angles between col vectors
                v1 = numpy.sqrt(numpy.sum(numpy.power(matrix, 2), 0))
                matrix = numpy.dot(numpy.transpose(matrix), matrix)
                row_headers = col_headers
                nrows = ncols
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y] /= v1[x] * v1[y]

            elif method == "euclid":
                # convert to euclidean distance matrix
                matrix = numpy.zeros((ncols, ncols), numpy.float64)
                for c1 in range(0, ncols - 1):
                    for c2 in range(c1 + 1, ncols):
                        for r in range(0, nrows):
                            d = raw_matrix[r][c1] - raw_matrix[r][c2]
                            matrix[c1, c2] += (d * d)
                        matrix[c2, c1] = matrix[c1, c2]
                matrix = numpy.sqrt(matrix)
                row_headers = col_headers
                nrows = ncols

            elif method.startswith("symmetrize"):
                f = method.split("-")[1]
                if f == "max":
                    f = max
                elif f == "min":
                    f = min
                elif f == "mean":
                    f = lambda x, y: float(x + y) / 2

                if nrows != ncols:
                    raise ValueError(
                        "symmetrize only available for symmetric matrices")
                if row_headers != col_headers:
                    raise ValueError(
                        "symmetrize not available for permuted matrices")
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y] = matrix[y, x] = \
                            f(matrix[x, y], matrix[y, x])
            elif method == "sub":
                matrix = options.value - matrix

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(options.parameters[parameter])
                new_value = float(options.parameters[parameter + 1])
                parameter += 2
                if method == "upper-bound":
                    for x in range(nrows):
                        for y in range(ncols):
                            if matrix[x, y] > boundary:
                                matrix[x, y] = new_value
                else:
                    for x in range(nrows):
                        for y in range(ncols):
                            if matrix[x, y] < boundary:
                                matrix[x, y] = new_value

            elif method == "subtract-matrix":
                matrix = matrix - other_matrix

            elif method == "add-matrix":
                matrix = matrix + other_matrix

            elif method == "normalize-by-matrix":

                # set 0s to 1 in the other matrix
                for x in range(nrows):
                    for y in range(ncols):
                        if other_matrix[x, y] == 0:
                            other_matrix[x, y] = 1.0

                matrix = matrix / other_matrix

            elif method == "mix-matrix":
                for x in range(len(other_row_headers) - 1):
                    for y in range(x + 1, len(other_col_headers)):
                        matrix[x, y] = other_matrix[x, y]

            elif method == "set-diagonal":
                value = float(options.parameters[parameter])
                for x in range(min(nrows, ncols)):
                    matrix[x, x] = value
                parameter += 1

            elif method == "transpose":
                matrix = numpy.transpose(raw_matrix)
                row_headers, col_headers = col_headers, row_headers

            elif method == "correspondence-analysis":
                row_indices, col_indices = CorrespondenceAnalysis.GetIndices(
                    raw_matrix)
                map_row_new2old = numpy.argsort(row_indices)
                map_col_new2old = numpy.argsort(col_indices)

                matrix, row_headers, col_headers = CorrespondenceAnalysis.GetPermutatedMatrix(
                    raw_matrix,
                    map_row_new2old,
                    map_col_new2old,
                    row_headers=row_headers,
                    col_headers=col_headers)

            elif method == "mask-rows":
                r = set(row_names)
                for x in range(len(row_headers)):
                    if row_headers[x] in r:
                        matrix[x, :] = options.value

            elif method == "mask-columns":
                r = set(column_names)
                for x in range(len(col_headers)):
                    if col_headers[x] in r:
                        matrix[:, x] = options.value

            elif method == "mask-rows-and-columns":

                r = set(row_names)
                c = set(column_names)
                for x in range(len(row_headers)):
                    for y in range(len(col_headers)):
                        if row_headers[x] in r and col_headers[y] in c:
                            matrix[x, y] = options.value

            raw_matrix = numpy.reshape(numpy.array(matrix), matrix.shape)

        else:
            # for simple re-formatting jobs
            matrix = raw_matrix

        if options.write_separators:
            options.stdout.write(lines[chunks[chunk]])

        MatlabTools.writeMatrix(sys.stdout,
                                matrix,
                                value_format=options.format,
                                format=options.output_format,
                                row_headers=row_headers,
                                col_headers=col_headers)

    E.Stop()
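A minimal numpy sketch of what the "normalize-by-row-total" method above computes, assuming a plain 2-D float array (the script itself additionally handles headers and several input/output formats):

import numpy

def normalize_by_row_total(matrix):
    # Divide each row by its total, leaving all-zero rows unchanged.
    result = numpy.array(matrix, dtype=float)
    for x in range(result.shape[0]):
        total = result[x, :].sum()
        if total != 0:
            result[x, :] /= total
    return result

# normalize_by_row_total([[1.0, 3.0], [0.0, 0.0]]) -> [[0.25, 0.75], [0.0, 0.0]]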
コード例 #39
0
ファイル: psl2psl.py プロジェクト: lesheng/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: psl2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--filter-query",
        dest="filename_filter_query",
        type="string",
        help=
        "filename with intervals in the query to filter (in gff format) [default=%default]."
    )

    parser.add_option(
        "--filter-target",
        dest="filename_filter_target",
        type="string",
        help=
        "filename with intervals in the target to filter (in gff format) [default=%default]."
    )

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("map", "merge", "add-sequence", "complement",
                               "select-query", "test", "filter-keep",
                               "filter-remove", "rename-query", "sanitize",
                               "filter-fasta", "remove-overlapping-query",
                               "remove-overlapping-target"),
                      help="""action to perform [default=%default].""")

    parser.add_option("--select",
                      dest="select",
                      type="choice",
                      choices=("most-nmatches", "least-nmatches",
                               "most-nmismatches", "least-nmismatches"),
                      help="entry to select [default=%default].")

    parser.add_option("--header",
                      dest="header",
                      type="choice",
                      choices=("none", "table", "full"),
                      help="output psl header [default=%default].")

    parser.add_option("--format",
                      dest="format",
                      type="choice",
                      choices=("gff", "gtf"),
                      help="format of intervals [default=%default].")

    parser.add_option("--filename-queries",
                      dest="filename_queries",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--filename-target",
                      dest="filename_sbjcts",
                      type="string",
                      help="fasta filename with sbjct [default=%default].")

    parser.add_option(
        "--id-format",
        dest="id_format",
        type="string",
        help=
        "format of new identifiers for the rename function [default=%default]."
    )

    parser.add_option(
        "--unique",
        dest="unique",
        action="store_true",
        help=
        "in the rename function, make each match unique [default=%default].")

    parser.add_option(
        "--output-filename-map",
        dest="output_filename_map",
        type="string",
        help=
        "filename with map of old to new labels for rename function [default=%default]."
    )

    parser.add_option(
        "--complement-min-length",
        dest="complement_min_length",
        type="int",
        help="minimum length for complemented blocks [default=%default].")

    parser.add_option(
        "--complement-border",
        dest="complement_border",
        type="int",
        help=
        "number of residues to exclude before alignment at either end [default=%default]."
    )

    parser.add_option(
        "--complement-aligner",
        dest="complement_aligner",
        type="choice",
        choices=("clustal", "dba", "dialign", "dialign-lgs"),
        help="aligner for complemented segments [default=%default].")

    parser.add_option(
        "--threshold-merge-distance",
        dest="threshold_merge_distance",
        type="int",
        help=
        "distance in nucleotides at which two adjacent reads shall be merged even if they are not overlapping [%default]."
    )

    parser.add_option(
        "--test",
        dest="test",
        type="int",
        help=
        "for debugging purposes - stop after x iterations [default=%default].")

    parser.set_defaults(filename_filter_target=None,
                        filename_filter_query=None,
                        filename_queries=None,
                        filename_sbjcts=None,
                        threshold_merge_distance=0,
                        report_step=100000,
                        min_aligned=100,
                        methods=[],
                        format="gff",
                        select="most-nmatches",
                        id_format="%06i",
                        unique=False,
                        output_filename_map=None,
                        header=None,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.filename_queries:
        query_fasta = IndexedFasta.IndexedFasta(options.filename_queries)
    else:
        query_fasta = None

    if options.filename_sbjcts:
        sbjct_fasta = IndexedFasta.IndexedFasta(options.filename_sbjcts)
    else:
        sbjct_fasta = None

    if "add-sequence" in options.methods and (sbjct_fasta is None
                                              or query_fasta is None):
        raise ValueError(
            "please supply both indexed query and target/genome sequence data."
        )

    iterator = Blat.iterator(options.stdin)

    if options.header is not None and options.header != "none":
        if options.header == "table":
            options.stdout.write("\t".join(Blat.FIELDS) + "\n")
        elif options.header == "full":
            options.stdout.write(Blat.HEADER + "\n")

    for method in options.methods:

        if "map" == method:
            pslMap(options)
            break
        elif "filter-keep" == method:
            pslFilter(options, keep=True)
            break
        elif "filter-remove" == method:
            pslFilter(options, keep=False)
            break
        elif "merge" == method:
            pslMerge(options)
            break
        elif "add-sequence" == method:
            pslAddSequence(query_fasta, sbjct_fasta, options)
            break
        elif "complement" == method:
            pslComplement(query_fasta, sbjct_fasta, options)
            break
        elif "select-query" == method:
            pslSelectQuery(options)
            break
        elif "test" == method:
            iterator = Blat.iterator_test(iterator, options.report_step)
        elif "rename-query" == method:
            iterator = iterator_rename_query(iterator, options)
        elif "sanitize" == method:
            iterator = iterator_sanitize(iterator, query_fasta, sbjct_fasta,
                                         options)
        elif "filter-fasta" == method:
            iterator = iterator_filter_fasta(iterator, query_fasta,
                                             sbjct_fasta, options)
        elif "remove-overlapping-query" == method:
            iterator = iterator_filter_overlapping_query(iterator, options)
        elif "remove-overlapping-target" == method:
            iterator = iterator_filter_overlapping_target(iterator, options)

    for psl in iterator:
        options.stdout.write("%s\n" % str(psl))

    E.Stop()
コード例 #40
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: psl2map.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--input-filename-queries",
        dest="input_filename_queries",
        type="string",
        help=
        "fasta filename with queries - required for polyA analysis [%default]."
    )

    parser.add_option("--polyA",
                      dest="polyA",
                      action="store_true",
                      help="detect polyA tails [%default].")

    parser.add_option(
        "-p",
        "--output-filename-pattern",
        dest="output_filename_pattern",
        type="string",
        help=
        "OUTPUT filename with histogram information on aggregate coverages [%default]."
    )

    parser.add_option(
        "--output-filename-empty",
        dest="output_filename_empty",
        type="string",
        help=
        "OUTPUT filename with queries for which all matches have been discarded [%default]."
    )

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("map", "psl"),
                      help="output format to choose [%default].")

    parser.add_option("-z",
                      "--from-zipped",
                      dest="from_zipped",
                      action="store_true",
                      help="input is zipped.")

    parser.add_option("--threshold-min-pid",
                      dest="threshold_min_pid",
                      type="float",
                      help="minimum thresholds for pid [%default].")

    parser.add_option(
        "--threshold-min-matches",
        dest="threshold_min_matches",
        type="int",
        help="minimum threshold for number of matching residues [%default].")

    parser.add_option(
        "--threshold-max-error-rate",
        dest="threshold_max_error_rate",
        type="float",
        help="maximum threshold for error of aligned part [%default].")

    parser.add_option(
        "--threshold-good-query-coverage",
        dest="threshold_good_query_coverage",
        type="float",
        help=
        "minimum query coverage for segments to be counted as good [%default]."
    )

    parser.add_option(
        "--threshold-min-query-coverage",
        dest="threshold_min_query_coverage",
        type="float",
        help="minimum query coverage for segments to be accepted [%default].")

    parser.add_option(
        "--threshold-max-query-gapchars",
        dest="threshold_max_query_gapchars",
        type="int",
        help="maximum number of gap characters  in query[%default].")

    parser.add_option("--threshold-max-query-gaps",
                      dest="threshold_max_query_gaps",
                      type="int",
                      help="maximum number of gaps  in query[%default].")

    parser.add_option(
        "--threshold-max-sbjct-gapchars",
        dest="threshold_max_sbjct_gapchars",
        type="int",
        help="maximum number of gap characters  in sbjct[%default].")

    parser.add_option("--keep-unique-matches",
                      dest="keep_unique_matches",
                      action="store_true",
                      help="ignore filters for unique matches [%default].")

    parser.add_option(
        "--keep-all-best",
        dest="keep_all_best",
        action="store_true",
        help=
        "when sorting matches, keep all matches within the collection threshold [%default]."
    )

    parser.add_option(
        "--best-per-sbjct",
        dest="best_per_sbjct",
        action="store_true",
        help=
        "keep only the best entry per sbjct (for transcript mapping) [%default]."
    )

    parser.add_option("--threshold-max-sbjct-gaps",
                      dest="threshold_max_sbjct_gaps",
                      type="int",
                      help="maximum number of gaps  in sbjct[%default].")

    parser.add_option("--test",
                      dest="test",
                      type="int",
                      help="test - stop after # rows of parsing[%default].")

    parser.add_option(
        "-m",
        "--matching-mode",
        dest="matching_mode",
        type="choice",
        choices=("best-coverage", "best-query-coverage", "best-sbjct-coverage",
                 "best-pid", "best-covpid", "best-query-covpid",
                 "best-sbjct-covpid", "best-min-covpid",
                 "best-query-min-covpid", "best-sbjct-min-covpid", "unique",
                 "all"),
        help="determines how to selecte the best match [%default].")

    parser.add_option(
        "--filename-filter-sbjct",
        dest="filename_filter_sbjct",
        type="string",
        help=
        "gff file for filtering sbjct matches. Matches overlapping these regions are discarded, but see --keep-forbidden [%default]."
    )

    parser.add_option(
        "--keep-forbidden",
        dest="keep_forbidden",
        action="store_true",
        help=
        "if set, keep only matches that overlap the regions supplied with --filename-filter-sbjct [%default]."
    )

    parser.add_option(
        "--query-forward-coordinates",
        dest="query_forward_coordinates",
        action="store_true",
        help=
        "use forward coordinates for query, strand will refer to sbjct [%default]."
    )

    parser.add_option(
        "--ignore-all-random",
        dest="ignore_all_random",
        action="store_true",
        help=
        "if there are multiple best matches, ignore all those to chrUn and _random [%default]."
    )

    parser.add_option(
        "--collection-threshold",
        dest="collection_threshold",
        type="float",
        help=
        "threshold for collecting matches, percent of best score [%default].")

    parser.add_option(
        "--collection-distance",
        dest="collection_distance",
        type="float",
        help=
        "threshold for collecting matches, difference to best score [%default]."
    )

    parser.set_defaults(
        input_filename_domains=None,
        input_filename_queries=None,
        threshold_good_query_coverage=90.0,
        threshold_min_pid=30.0,
        threshold_min_matches=0,
        threshold_max_error_rate=None,
        output_filename_pattern="%s",
        keep_unique_matches=False,
        output_format="map",
        print_matched=["full", "partial", "good"],
        from_zipped=False,
        combine_overlaps=True,
        min_length_domain=30,
        threshold_min_query_coverage=50,
        min_length_singletons=30,
        new_family_id=10000000,
        add_singletons=False,
        matching_mode="best-coverage",
        best_per_sbjct=False,
        threshold_max_query_gapchars=None,
        threshold_max_query_gaps=None,
        threshold_max_sbjct_gapchars=None,
        threshold_max_sbjct_gaps=None,
        filename_filter_sbjct=None,
        keep_forbidden=False,
        keep_all_best=False,
        test=None,
        query_forward_coordinates=False,
        output_filename_empty=None,
        collection_threshold=1.0,
        collection_distance=0,
        polyA=False,
        # max residues missing from non polyA end
        polyA_max_unaligned=3,
        # min residues in tail
        polyA_min_unaligned=10,
        # min percent residues that are A/T in tail
        polyA_min_percent=70.0,
        # ignore duplicate matches if they are on chrUn or _random
        ignore_all_random=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) == 1:
        if options.from_zipped or args[0][-3:] == ".gz":
            import gzip
            infile = gzip.open(args[0], "r")
        else:
            infile = open(args[0], "r")
    else:
        infile = sys.stdin

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    if options.filename_filter_sbjct:

        try:
            import bx.intervals.io
            import bx.intervals.intersection
        except ImportError:
            raise "filtering for intervals requires the bx tools."

        intervals = GTF.readGFFFromFileAsIntervals(
            open(options.filename_filter_sbjct, "r"))

        intersectors = {}

        for contig, values in intervals.items():
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in values:
                intersector.add_interval(bx.intervals.Interval(start, end))
            intersectors[contig] = intersector

        if options.loglevel >= 1:
            options.stdlog.write("# read %i intervals for %i contigs.\n" %\
                                 (sum( [ len(x) for x in intervals.values() ] ),
                                  len( intersectors ) ))
    else:
        intersectors = None

    ################################################
    ################################################
    ################################################
    ## processing of a chunk (matches of same query)
    ################################################
    ninput, noutput, nskipped = 0, 0, 0

    ## number of sequences with full/partial/good matches
    nfull_matches, npartial_matches, ngood_matches = 0, 0, 0
    ## number of sequences which are fully/good/partially matched
    ## i.e., after combining all aligned regions
    nfully_matched, npartially_matched, nwell_matched = 0, 0, 0

    nremoved_pid, nremoved_query_coverage, nempty = 0, 0, 0
    nremoved_gaps, nremoved_nmatches = 0, 0
    nremoved_regions = 0
    nqueries_removed_region = 0

    aggregate_coverages = []
    mapped_coverages = []
    fully_matched = []
    well_matched = []
    partially_matched = []
    new_family_id = options.new_family_id

    if options.output_filename_empty:
        outfile_empty = open(options.output_filename_empty, "w")
        outfile_empty.write("read_id\tcomment\n")
    else:
        outfile_empty = None

    if options.polyA:
        options.outfile_polyA = open(options.output_filename_pattern % "polyA",
                                     "w")
        options.outfile_polyA.write("query_id\tstart\tend\tpA+N\tpT+N\ttail\n")

    def processChunk(query_id, matches):
        """process a set of matches from query_id"""

        global ninput, noutput, nskipped
        global nfull_matches, npartial_matches, ngood_matches
        global nremoved_pid, nremoved_query_coverage, nempty, nremoved_gaps, nremoved_nmatches
        global nremoved_regions, nqueries_removed_region
        global outfile_empty
        ninput += 1

        full_matches = []
        good_matches = []
        partial_matches = []

        x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches = 0, 0, 0, 0
        nmatches = len(matches)

        new_matches = []

        # absolute filters applicable to non-fragmentory matches

        for match in matches:

            if match.mPid < options.threshold_min_pid:
                nremoved_pid += 1
                continue

            if match.mNMatches < options.threshold_min_matches:
                nremoved_nmatches += 1
                continue

            if options.threshold_max_error_rate:
                r = 100.0 * math.pow(options.threshold_max_error_rate,
                                     match.mNMatches + match.mNMismatches)
                if match.mPid < r:
                    nremoved_pid += 1
                    x_nremoved_pid += 1
                    continue

            new_matches.append(match)

        matches = new_matches

        # filter matches
        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write( "%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %\
                                     (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches ) )
            nskipped += 1
            return

        if options.keep_unique_matches and len(matches) == 1:
            pass
        else:
            new_matches = []

            for match in matches:

                if match.mQueryCoverage < options.threshold_min_query_coverage:
                    nremoved_query_coverage += 1
                    x_nquery_coverage += 1
                    continue

                if options.threshold_max_query_gaps and match.mQueryNGapsCounts > options.threshold_max_query_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_query_gapchars and match.mQueryNGapsBases > options.threshold_max_query_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gaps and match.mSbjctNGapsCounts > options.threshold_max_sbjct_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gapchars and match.mSbjctNGapsBases > options.threshold_max_sbjct_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                new_matches.append(match)
            matches = new_matches

        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write( "%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %\
                                     (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches ) )
            nskipped += 1
            return

        ## Remove queries matching to a forbidden region. This section
        ## will remove the full query if any of its matches matches in a
        ## forbidden region.
        keep = True
        for match in matches:
            if intersectors and match.mSbjctId in intersectors:
                found = intersectors[match.mSbjctId].find(
                    match.mSbjctFrom, match.mSbjctTo)
                if (found and not options.keep_forbidden) or (
                        not found and options.keep_forbidden):
                    nremoved_regions += 1
                    keep = False
                    continue

        if not keep:
            nqueries_removed_region += 1
            if outfile_empty:
                outfile_empty.write("%s\toverlap with forbidden region\n" %
                                    query_id)
            return

        ## check for full length matches
        for match in matches:
            if match.mQueryCoverage >= 99.9:
                full_matches.append(match)
            if match.mQueryCoverage > options.threshold_good_query_coverage:
                good_matches.append(match)
            else:
                partial_matches.append(match)

        if full_matches:
            nfull_matches += 1
        elif good_matches:
            ngood_matches += 1
        elif partial_matches:
            npartial_matches += 1

        ## compute coverage of sequence with matches
        intervals = []
        for match in full_matches + good_matches + partial_matches:
            intervals.append((match.mQueryFrom, match.mQueryTo))

        rest = Intervals.complement(intervals, 0, match.mQueryLength)

        query_coverage = 100.0 * (match.mQueryLength - sum(
            map(lambda x: x[1] - x[0], rest))) / match.mQueryLength

        if query_coverage >= 99.9:
            fully_matched.append(query_id)
        elif query_coverage > options.threshold_good_query_coverage:
            well_matched.append(query_id)
        else:
            partially_matched.append(query_id)

        aggregate_coverages.append(query_coverage)

        ## select matches to output
        matches, msg = selectMatches(query_id, matches, options, queries_fasta)

        if len(matches) > 0:
            for match in matches:
                if options.query_forward_coordinates:
                    match.convertCoordinates()

                if options.output_format == "map":
                    options.stdout.write( "%s\n" %\
                                              "\t".join( map(str, (
                                match.mQueryId, match.mSbjctId,
                                match.strand,
                                "%5.2f" % match.mQueryCoverage,
                                "%5.2f" % match.mSbjctCoverage,
                                "%5.2f" % match.mPid,
                                match.mQueryLength,
                                match.mSbjctLength,
                                match.mQueryFrom, match.mQueryTo,
                                match.mSbjctFrom, match.mSbjctTo,
                                ",".join( map(str,match.mBlockSizes) ),
                                ",".join( map(str,match.mQueryBlockStarts)),
                                ",".join( map(str,match.mSbjctBlockStarts)),
                                ))))
                elif options.output_format == "psl":
                    options.stdout.write(str(match) + "\n")

            noutput += 1
        else:
            if outfile_empty:
                outfile_empty.write("%s\tno matches selected: %s\n" %
                                    (query_id, msg))
            nempty += 1

    if options.output_format == "map":
        options.stdout.write("\t".join(
            ("query_id", "sbjct_id", "sstrand", "qcoverage", "scoverage",
             "pid", "qlen", "slen", "qfrom", "qto", "sfrom", "sto", "blocks",
             "qstarts", "sstarts")) + "\n")
    elif options.output_format == "psl":
        options.stdout.write(Blat.Match().getHeader() + "\n")

    ################################################
    ################################################
    ################################################
    ## main loop
    ################################################
    nfully_covered = None
    matches = []
    last_query_id = None
    is_complete = True
    ninput_lines = 0

    skip = 0

    iterator = Blat.BlatIterator(infile)

    while 1:

        try:
            match = iterator.next()
        except Blat.ParsingError:
            iterator = Blat.BlatIterator(infile)
            continue

        if match is None:
            break

        ninput_lines += 1

        if options.test and ninput_lines > options.test:
            break

        if match.mQueryId != last_query_id:
            if last_query_id:
                processChunk(last_query_id, matches)
            matches = []
            last_query_id = match.mQueryId

        matches.append(match)

    processChunk(last_query_id, matches)

    printHistogram(aggregate_coverages, "aggregate", options)

    printHistogram(mapped_coverages, "mapped", options)

    if "full" in options.print_matched:
        printMatched(fully_matched, "full", options)

    if "good" in options.print_matched:
        printMatched(well_matched, "good", options)

    if "partial" in options.print_matched:
        printMatched(partially_matched, "partial", options)

    if options.loglevel >= 1:
        options.stdlog.write("# alignments: ninput=%i, is_complete=%s\n" %
                             (ninput_lines, str(is_complete)))
        options.stdlog.write("# queries: ninput=%i, noutput=%i\n" %
                             (ninput, noutput))
        options.stdlog.write(
            "# individual coverage: full=%i, good=%i, partial=%i\n" %
            (nfull_matches, ngood_matches, npartial_matches))
        options.stdlog.write(
            "# aggregate  coverage: full=%i, good=%i, partial=%i\n" %
            (len(fully_matched), len(well_matched), len(partially_matched)))
        options.stdlog.write("# omitted queries: total=%i, thresholds=%i, regions=%i, selection=%i\n" %\
                             (nskipped+nqueries_removed_region+nempty,
                              nskipped, nqueries_removed_region, nempty))
        options.stdlog.write(
            "# omitted matches: pid=%i, query_coverage=%i, gaps=%i, regions=%i, nmatches=%i\n"
            % (nremoved_pid, nremoved_query_coverage, nremoved_gaps,
               nremoved_regions, nremoved_nmatches))

    E.Stop()
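The aggregate query coverage reported above is obtained by complementing the matched intervals against the query length; a simplified stand-in for the Intervals.complement() based calculation, assuming half-open and possibly overlapping intervals:

def query_coverage(intervals, query_length):
    # Percentage of the query covered by the union of the matched intervals.
    covered, last_end = 0, 0
    for start, end in sorted(intervals):
        start = max(start, last_end)
        if end > start:
            covered += end - start
            last_end = end
    return 100.0 * covered / query_length

# query_coverage([(0, 50), (40, 80)], 100) -> 80.0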
コード例 #41
0
ファイル: psl2psl.py プロジェクト: lesheng/cgat
def pslMerge(options):
    """merge psl alignments.
    """

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    last_query = None
    last_target = None
    last_strand = None

    def process(matches):

        new = matches[0].copy()

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        graph = networkx.DiGraph()
        graph.add_nodes_from(xrange(len(matches) + 2))

        matches.sort(key=lambda x: x.mQueryFrom)

        if Genomics.IsPositiveStrand(matches[0].strand):
            f = lambda x, y: x.mSbjctTo < y.mSbjctFrom
        else:
            f = lambda x, y: x.mSbjctFrom > y.mSbjctTo

        for x in range(0, len(matches)):

            xx = matches[x]
            if options.loglevel >= 6:
                options.stdlog.write("# graph: %2i %s\n" % (x, str(xx)))

            for y in range(x + 1, len(matches)):
                yy = matches[y]
                d = min(xx.mQueryTo, yy.mQueryTo) - \
                    max(xx.mQueryFrom, yy.mQueryFrom)
                if d > 0 or not f(xx, yy):
                    continue
                else:
                    graph.add_edge(x, y, {'weight': -d})

        source = len(matches)
        target = len(matches) + 1
        for x in range(len(matches)):
            xx = matches[x]
            graph.add_edge(source, x, {'weight': xx.mQueryFrom})
            graph.add_edge(x, target,
                           {'weight': xx.mQueryLength - xx.mQueryTo})

        if options.loglevel >= 6:
            networkx.write_edgelist(graph, options.stdlog)

        path = networkx.dijkstra_path(graph, source, target)

        if options.loglevel >= 6:
            options.stdlog.write("# path: %s\n" % (str(path)))

        new_matches = [matches[x] for x in path[1:-1]]

        if len(matches) != len(new_matches):
            E.warn(
                "query=%s, target=%s, strand=%s: removed overlapping/out-of-order segments: before=%i, after=%i"
                % (matches[0].mQueryId, matches[0].mSbjctId, matches[0].strand,
                   len(matches), len(new_matches)))

        matches = new_matches

        for match in matches:
            m = match.getMapQuery2Target()
            alignlib_lite.py_addAlignment2Alignment(map_query2target, m)

        new.fromMap(map_query2target, use_strand=True)

        options.stdout.write(str(new) + "\n")
        options.stdout.flush()
        return 1

    while 1:

        match = iterator.next()
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if options.loglevel >= 10:
            options.stdlog.write("# input: %s\n" % (str(match)))

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        if match.mQueryId != last_query or match.strand != last_strand or match.mSbjctId != last_target:
            if last_query:
                noutput += process(matches)
            matches = []
            last_query, last_target, last_strand = match.mQueryId, match.mSbjctId, match.strand

        matches.append(match)

    if last_query:
        noutput += process(matches)

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
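The chaining in process() above is essentially a shortest-path problem over query segments: edge weights penalise unaligned query sequence, so the cheapest source-to-target path keeps a consistent, non-overlapping chain. A toy sketch of the same idea with networkx (segment coordinates are invented for illustration):

import networkx

def chain_segments(segments, query_length):
    # segments: list of (query_from, query_to) in query coordinates.
    graph = networkx.DiGraph()
    n = len(segments)
    source, target = n, n + 1
    for i, (ifrom, ito) in enumerate(segments):
        graph.add_edge(source, i, weight=ifrom)               # unaligned prefix
        graph.add_edge(i, target, weight=query_length - ito)  # unaligned suffix
        for j, (jfrom, jto) in enumerate(segments):
            if i != j and ito <= jfrom:                       # j can follow i
                graph.add_edge(i, j, weight=jfrom - ito)      # gap between them
    path = networkx.dijkstra_path(graph, source, target)
    return [segments[i] for i in path[1:-1]]

# chain_segments([(0, 40), (30, 60), (45, 100)], 100) -> [(0, 40), (45, 100)]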
コード例 #42
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: diff_bed.py 2866 2010-03-03 10:18:49Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-e", "--exclusive", dest="exclusive", action="store_true",
                      help="Intervals reported will be merged across the positive set"
                           " and do not overlap any interval in any of the other sets"
                           " [default=%default].")

    parser.add_option("-p", "--pattern-id", dest="pattern_id", type="string",
                      help="pattern to convert a filename to an id [default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("merged-combinations",
                               "unmerged-combinations"),
                      help = "method to perform [default=%default]")

    parser.set_defaults(
        pattern_id="(.*).bed.gz",
        exclusive=False,
        method="merged-combinations",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if len(args) < 2:
        raise ValueError("at least two arguments required")

    tags, bedfiles = [], []
    for infile in args:
        bedfiles.append(pysam.Tabixfile(infile, "r"))
        tags.append(re.search(options.pattern_id, infile).groups()[0])

    indices = range(len(bedfiles))
    is_exclusive = options.exclusive

    if options.method == "merged-combinations":

        if is_exclusive:
            start = 1
        else:
            start = 2

        options.stdout.write("combination\twithout\tcounts\n")

        for ncombinants in range(start, len(bedfiles) + 1):
            for combination in itertools.combinations(indices, ncombinants):
                other = [x for x in indices if x not in combination]
                tag = ":".join([tags[x] for x in combination])
                E.debug("combination %s started" % tag)
                E.debug("other: %s" % ":".join([tags[x] for x in other]))
                other_bed = [bedfiles[x] for x in other]
                outf = IOTools.openFile(
                    E.getOutputFile(tag), "w", create_dir=True)
                c = E.Counter()
                for contig, start, end in combineMergedIntervals([bedfiles[x] for x in combination]):
                    c.found += 1
                    if is_exclusive and isContainedInOne(contig, start, end, other_bed):
                        c.removed += 1
                        continue
                    c.output += 1
                    outf.write("%s\t%i\t%i\n" % (contig, start, end))

                outf.close()
                E.info("combination %s finished: %s" % (tag, c))

                options.stdout.write("%s\t%s\t%i\n" % (
                    ":".join([tags[x] for x in combination]),
                    ":".join([tags[x] for x in other]),
                    c.output))

    elif options.method == "unmerged-combinations":
        options.stdout.write("track\tcombination\twithout\tcounts\n")

        for foreground in indices:

            start = 0

            background = [x for x in indices if x != foreground]
            for ncombinants in range(0, len(background) + 1):
                for combination in itertools.combinations(background, ncombinants):
                    other = [x for x in background if x not in combination]
                    combination_bed = [bedfiles[x] for x in combination]
                    other_bed = [bedfiles[x] for x in other]
                    tag = ":".join([tags[foreground]] + [tags[x]
                                   for x in combination])

                    E.debug("fg=%i, combination=%s, other=%s" %
                            (foreground, combination, other))
                    E.debug("combination %s started" % tag)
                    E.debug("other: %s" % ":".join([tags[x] for x in other]))

                    outf = IOTools.openFile(
                        E.getOutputFile(tag), "w", create_dir=True)
                    c = E.Counter()
                    for bed in combineUnmergedIntervals(
                            bedfiles[foreground],
                            combination_bed):
                        c.found += 1
                        if is_exclusive and isContainedInOne(bed.contig, bed.start, bed.end, other_bed):
                            c.removed += 1
                            continue
                        c.output += 1
                        outf.write("%s\n" % str(bed))

                    outf.close()
                    E.info("combination %s finished: %s" % (tag, c))

                    options.stdout.write("%s\t%s\t%s\t%i\n" % (
                        tags[foreground],
                        ":".join([tags[x] for x in combination]),
                        ":".join([tags[x] for x in other]),
                        c.output))

    E.Stop()
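The merged-combinations mode above walks over every subset of input tracks and its complement; a minimal illustration of that bookkeeping (track names are hypothetical):

import itertools

tags = ["liver", "kidney", "brain"]
indices = range(len(tags))

for n in range(2, len(tags) + 1):
    for combination in itertools.combinations(indices, n):
        other = [x for x in indices if x not in combination]
        print("%s\twithout\t%s" % (":".join(tags[x] for x in combination),
                                   ":".join(tags[x] for x in other)))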
コード例 #43
0
ファイル: psl2psl.py プロジェクト: lesheng/cgat
            continue
        except StopIteration:
            break

        if not x:
            break

        ninput += 1

        if query_fasta:
            x.mQueryId = fq(x.mQueryId)
        if sbjct_fasta:
            x.mSbjctId = ft(x.mSbjctId)

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        yield x

    E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors))


def iterator_filter_fasta(infile, query_fasta, sbjct_fasta, options):

    ninput, noutput, nerrors = 0, 0, 0

    qmissing = collections.defaultdict(int)
    smissing = collections.defaultdict(int)

    while 1:
        try:
コード例 #44
0
ファイル: psl2psl.py プロジェクト: lesheng/cgat
    def process(matches):

        new = matches[0].copy()

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        graph = networkx.DiGraph()
        graph.add_nodes_from(xrange(len(matches) + 2))

        matches.sort(key=lambda x: x.mQueryFrom)

        if Genomics.IsPositiveStrand(matches[0].strand):
            f = lambda x, y: x.mSbjctTo < y.mSbjctFrom
        else:
            f = lambda x, y: x.mSbjctFrom > y.mSbjctTo

        for x in range(0, len(matches)):

            xx = matches[x]
            if options.loglevel >= 6:
                options.stdlog.write("# graph: %2i %s\n" % (x, str(xx)))

            for y in range(x + 1, len(matches)):
                yy = matches[y]
                d = min(xx.mQueryTo, yy.mQueryTo) - \
                    max(xx.mQueryFrom, yy.mQueryFrom)
                if d > 0 or not f(xx, yy):
                    continue
                else:
                    graph.add_edge(x, y, {'weight': -d})

        source = len(matches)
        target = len(matches) + 1
        for x in range(len(matches)):
            xx = matches[x]
            graph.add_edge(source, x, {'weight': xx.mQueryFrom})
            graph.add_edge(x, target,
                           {'weight': xx.mQueryLength - xx.mQueryTo})

        if options.loglevel >= 6:
            networkx.write_edgelist(graph, options.stdlog)

        path = networkx.dijkstra_path(graph, source, target)

        if options.loglevel >= 6:
            options.stdlog.write("# path: %s\n" % (str(path)))

        new_matches = [matches[x] for x in path[1:-1]]

        if len(matches) != len(new_matches):
            E.warn(
                "query=%s, target=%s, strand=%s: removed overlapping/out-of-order segments: before=%i, after=%i"
                % (matches[0].mQueryId, matches[0].mSbjctId, matches[0].strand,
                   len(matches), len(new_matches)))

        matches = new_matches

        for match in matches:
            m = match.getMapQuery2Target()
            alignlib_lite.py_addAlignment2Alignment(map_query2target, m)

        new.fromMap(map_query2target, use_strand=True)

        options.stdout.write(str(new) + "\n")
        options.stdout.flush()
        return 1
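
The chaining step in process() can be illustrated with a small, self-contained sketch: segments become nodes, compatible (in-order, non-overlapping) pairs become edges weighted by the gap between them, and the shortest path from a virtual source to a virtual sink selects a consistent subset. The coordinates and query length below are toy values, not data from the script.

import networkx

segments = [(0, 100), (90, 200), (250, 400)]   # hypothetical (query_from, query_to)
query_length = 500

graph = networkx.DiGraph()
source, target = len(segments), len(segments) + 1

for i, (astart, aend) in enumerate(segments):
    # cost of leaving the query prefix/suffix unaligned
    graph.add_edge(source, i, weight=astart)
    graph.add_edge(i, target, weight=query_length - aend)
    for j, (bstart, bend) in enumerate(segments):
        if bstart >= aend:
            # non-overlapping and in order: cost is the gap between segments
            graph.add_edge(i, j, weight=bstart - aend)

path = networkx.dijkstra_path(graph, source, target)
kept = path[1:-1]
print(kept)   # [1, 2]: the first segment, which overlaps the second, is dropped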
コード例 #45
0
ファイル: fastq2fastq.py プロジェクト: lesheng/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--change-format", dest="change_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer'),
                      help="guess quality score format and set quality scores to format [default=%default].")

    parser.add_option("--guess-format", dest="guess_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer'),
                      help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option("--sample", dest="sample", type="float",
                      help="sample a proportion of reads [default=%default].")

    parser.add_option("--pair", dest="pair", type="string",
                      help="if data is paired, filename with second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option("--outfile-pair", dest="outfile_pair", type="string",
                      help="if data is paired, filename for second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option("--uniq", dest="uniq", action="store_true",
                      help="remove duplicate reads (by name) [default=%default].")

    parser.add_option("--apply", dest="apply", type="string",
                      help="apply a filter to fastq file (taking only reads in filename) [default=%default].")

    parser.add_option("--trim3", dest="trim3", type="int",
                      help="trim # bases from 3' end [default=%default].")

    parser.add_option("--sort", dest="sort", action="store_true",
                      help="sort fastq by sequence id [default=%default].")

    parser.add_option("--seed", dest="seed", type="int",
                      help="seed for random number generator [default=%default].")

    parser.add_option("--renumber-ids", dest="renumber_ids", type="string",
                      help="rename reads in file by pattern [default=%default]")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        sample=None,
        trim3=None,
        pair=None,
        apply=None,
        uniq=False,
        outfile_pair=None,
        sort=None,
        seed=None,
        renumber_ids=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if options.change_format:
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.change_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.sample:
        sample_threshold = min(1.0, options.sample)

        random.seed(options.seed)

        if options.pair:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair (--outfile-pair)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")

            for record1, record2 in itertools.izip(Fastq.iterate(options.stdin), Fastq.iterate(IOTools.openFile(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)

        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.apply:
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.trim3:
        trim3 = options.trim3
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.uniq:
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.sort:
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair (--outfile-pair)")
            E.warn(
                "consider sorting individual fastq files - this is memory intensive")
            entries1 = {}
            entries2 = {}
            for record1, record2 in itertools.izip(Fastq.iterate(options.stdin), Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[
                    record1.identifier[:-2]] = (record1.seq, record1.quals)
                entries2[
                    record2.identifier[:-2]] = (record2.seq, record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")
            assert len(set(entries1.keys()).intersection(set(entries2.keys()))) == len(entries1), """paired files do not contain the same reads
                                                                                                     need to reconcile files"""
            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0], entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0], entries2[entry][1]))

    elif options.renumber_ids:
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_ids % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
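
The paired sampling branch above keeps mates synchronised by drawing a single random number per pair. A minimal sketch of that idea, with the record iterables standing in for Fastq.iterate streams:

import random

def sample_pairs(records1, records2, proportion, seed=None):
    """yield (record1, record2) pairs, keeping each pair with the given
    probability; both mates share the same fate because only one random
    draw is made per pair."""
    random.seed(seed)
    for record1, record2 in zip(records1, records2):
        if random.random() <= proportion:
            yield record1, record2

# e.g. keep roughly half the pairs (on Python 2, itertools.izip avoids
# materialising both streams):
# kept = list(sample_pairs(reads1, reads2, 0.5, seed=17))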
コード例 #46
0
ファイル: psl2psl.py プロジェクト: lesheng/cgat
def pslMap(options):
    """thread psl alignments using intervals.

    """

    if options.format == "gtf":
        use_copy = False
    else:
        use_copy = True

    ninput, noutput, ndiscarded, nskipped, nskipped_small_queries = 0, 0, 0, 0, 0

    min_length = options.min_aligned

    for match, qx, tx in iterator_psl_intervals(options):

        map_query2target = match.getMapQuery2Target()

        ninput += 1

        # if no filter on qx or tx, use full segment
        if qx is None:
            qx = [(match.mQueryFrom, match.mQueryTo, 0)]
        if tx is None:
            tx = [(match.mSbjctFrom, match.mSbjctTo, 0)]

        # if no overlap: return
        if not qx or not tx:
            nskipped += 1
            continue

        for query in qx:

            qstart, qend, qval = query

            # skip elements that are too small
            if qend - qstart < min_length:
                E.debug("query too small - skipped at %s:%i-%i" %
                        (match.mQueryId, qstart, qend))
                nskipped_small_queries += 1
                continue

            E.debug("working on query %s:%i-%i" %
                    (match.mQueryId, qstart, qend))

            mqstart, mqend = (map_query2target.mapRowToCol(
                qstart, alignlib_lite.py_RIGHT),
                              map_query2target.mapRowToCol(
                                  qend, alignlib_lite.py_LEFT))

            if match.strand == "-":
                qstart, qend = match.mQueryLength - \
                    qend, match.mQueryLength - qstart

            for target in tx:

                tstart, tend, tval = target
                if tstart >= mqend or tend <= mqstart:
                    continue
                if tend - tstart < min_length:
                    continue

                new = alignlib_lite.py_makeAlignmentBlocks()

                if use_copy:
                    # do copy with range filter
                    if options.loglevel >= 3:

                        mtstart, mtend = map_query2target.mapColToRow(
                            tstart), map_query2target.mapColToRow(tend)
                        E.debug(
                            "query: %i-%i (len=%i)-> %i-%i(len=%i); target: %i-%i (len=%i)-> %i-%i (len=%i)"
                            % (qstart, qend, qend - qstart, mqstart, mqend,
                               mqend - mqstart, tstart, tend, tend - tstart,
                               mtstart, mtend, mtend - mtstart))

                    alignlib_lite.py_copyAlignment(new, map_query2target,
                                                   qstart, qend, tstart, tend)
                else:
                    # do copy with alignment filter
                    map_query = qval
                    if map_query:
                        tmp = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(tmp, map_query2target,
                                                       map_query,
                                                       alignlib_lite.py_RR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping query ###########\n")
                            options.stdlog.write("# %s\n" % str(
                                alignlib_lite.py_AlignmentFormatEmissions(
                                    map_query2target)))
                            options.stdlog.write("# %s\n" % str(
                                alignlib_lite.py_AlignmentFormatEmissions(
                                    map_query)))
                            options.stdlog.write("# %s\n" % str(
                                alignlib_lite.py_AlignmentFormatEmissions(tmp))
                                                 )
                    else:
                        tmp = map_query2target

                    map_target = tval
                    if map_target:
                        new = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(new, tmp, map_target,
                                                       alignlib_lite.py_CR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping target ###########\n")
                            options.stdlog.write("# before: %s\n" % str(
                                alignlib_lite.py_AlignmentFormatEmissions(tmp))
                                                 )
                            options.stdlog.write("# map   : %s\n" % str(
                                alignlib_lite.py_AlignmentFormatEmissions(
                                    map_target)))
                            options.stdlog.write("# after : %s\n" % str(
                                alignlib_lite.py_AlignmentFormatEmissions(new))
                                                 )
                    else:
                        new = tmp

                if options.loglevel >= 4:
                    E.debug("putative match with intervals: %s and %s: %i-%i" %
                            (str(query), str(target), qstart, qend))
                    if options.loglevel >= 5:
                        E.debug("input : %s" % str(
                            alignlib_lite.py_AlignmentFormatEmissions(
                                map_query2target)))
                        E.debug("final : %s" % str(
                            alignlib_lite.py_AlignmentFormatEmissions(new)))

                    if new.getLength() > 0:
                        n = match.copy()
                        n.fromMap(new, use_strand=True)
                        E.info("match : %s" % (str(n)))

                if new.getNumAligned() > options.min_aligned:
                    n = match.copy()
                    n.fromMap(new, use_strand=True)
                    options.stdout.write(str(n) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1

    E.info(
        "map: ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i, nsmall_queries=%i"
        % (ninput, noutput, nskipped, ndiscarded, nskipped_small_queries))
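
The per-pair filter inside pslMap() boils down to a half-open interval test plus a minimum-length check; a sketch of that predicate in isolation:

def keep_pair(tstart, tend, mqstart, mqend, min_length):
    """return True if the target segment [tstart, tend) overlaps the mapped
    query range [mqstart, mqend) and is at least min_length long."""
    if tstart >= mqend or tend <= mqstart:
        return False     # no overlap
    if tend - tstart < min_length:
        return False     # segment too small
    return True

assert keep_pair(0, 50, 25, 100, 10)
assert not keep_pair(0, 50, 50, 100, 10)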
コード例 #47
0
ファイル: tree_diff.py プロジェクト: yangjl/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: tree_diff.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-1",
                      "--filename-tree1",
                      dest="filename_tree1",
                      type="string",
                      help="filename with first tree(s).")
    parser.add_option("-2",
                      "--filename-tree2",
                      dest="filename_tree2",
                      type="string",
                      help="filename with second tree(s).")
    parser.add_option("-o",
                      "--outgroup",
                      dest="outgroup",
                      type="string",
                      help="reroot with outgroup before processing.")

    parser.set_defaults(filename_tree1=None,
                        filename_tree2=None,
                        outgroup=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if (len(args) == 2):
        options.filename_tree1, options.filename_tree2 = args

    if not options.filename_tree1 or not options.filename_tree2:
        raise ValueError("please specify two trees.")

    ## take first trees
    nexus = TreeTools.Newick2Nexus(open(options.filename_tree1, "r"))
    trees1 = nexus.trees
    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from %s.\n" %
                             (len(trees1), options.filename_tree1))

    ## take second trees
    nexus = TreeTools.Newick2Nexus(open(options.filename_tree2, "r"))
    trees2 = nexus.trees
    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from %s.\n" %
                             (len(trees2), options.filename_tree2))

    ntotal, nsame, ndiff = 0, 0, 0

    if options.outgroup:
        for tree in trees1:
            tree.root_with_outgroup(options.outgroup)
        for tree in trees2:
            tree.root_with_outgroup(options.outgroup)

    for x in range(len(trees1)):
        for y in range(len(trees2)):
            if options.loglevel >= 2:
                print trees1[x]
                print trees2[y]
            if trees1[x].is_identical(trees2[y]):
                code = "="
                nsame += 1
            else:
                code = "<>"
                ndiff += 1
            options.stdout.write("%s\t%i\t%i\n" % (code, x, y))
            ntotal += 1

    options.stdlog.write("# n1=%i, n2=%i, ntotal=%i, nsame=%i, ndiff=%i\n" %
                         (len(trees1), len(trees2), ntotal, nsame, ndiff))

    E.Stop()
コード例 #48
0
ファイル: mali2rates.py プロジェクト: yangjl/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2rates.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment")

    parser.add_option(
        "-s",
        "--sites",
        dest="sites",
        type="string",
        help="sites to use [default=%default].",
    )

    parser.add_option(
        "-f",
        "--file",
        dest="filename",
        type="string",
        help="filename of multiple alignment (- for stdin) [default=%default].",
        metavar="FILE")

    parser.add_option("-o",
                      "--format",
                      dest="format",
                      type="string",
                      help="format [default=%default].",
                      metavar="format")

    parser.add_option(
        "-d",
        "--distance",
        dest="distance",
        type="choice",
        choices=("PID", "T92", "JC69", "POVL", "F84", "LogDet", "K80", "F81",
                 "HKY85", "TN93", "REV", "UNREST", "REVU", "UNRESTU", "JTT",
                 "PMB", "PAM", "Kimura", "CategoriesModel"),
        help="method to use for distance calculation [default=%default].")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("phylip", "baseml", "own", "xrate"),
                      help="program to use for rate calculation.")

    parser.add_option("--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("list", "tree"),
                      help="output format.")

    parser.add_option(
        "-m",
        "--min-sites",
        dest="min_sites",
        type="int",
        help="minimum number of sites for output[default=%default].",
    )

    parser.add_option(
        "-a",
        "--alphabet",
        dest="alphabet",
        type="choice",
        choices=("aa", "na", "auto"),
        help="alphabet to use.",
    )

    parser.add_option("-t",
                      "--filename-tree",
                      dest="filename_tree",
                      type="string",
                      help="filename with tree information.")

    parser.add_option("--set-alpha",
                      dest="alpha",
                      type="float",
                      help="initial alpha value.")

    parser.add_option("--fix-alpha",
                      dest="fix_alpha",
                      action="store_true",
                      help="do not estimate alpha.")

    parser.add_option("--set-kappa",
                      dest="kappa",
                      type="float",
                      help="initial kappa value.")

    parser.add_option("--fix-kappa",
                      dest="fix_kappa",
                      action="store_true",
                      help="do not estimate kappa.")

    parser.add_option("--dump",
                      dest="dump",
                      action="store_true",
                      help="dump output.")

    parser.add_option("--test",
                      dest="test",
                      action="store_true",
                      help="test run - does not clean up.")

    parser.add_option("--pairwise",
                      dest="pairwise",
                      action="store_true",
                      help="force pairwise comparison.")

    parser.add_option(
        "--set-clean-data",
        dest="clean_data",
        type="choice",
        choices=("0", "1"),
        help=
        "PAML should cleanup data:  0=only gaps within pair are removed, 1=columns in the mali with gaps are removed."
    )

    parser.add_option(
        "--with-counts",
        dest="with_counts",
        action="store_true",
        help=
        "output counts of aligned positions, transitions and transversions.")

    parser.add_option("-w",
                      "--write",
                      dest="write",
                      type="choice",
                      action="append",
                      choices=("input", "trained", "all"),
                      help="output sections to write for xrate.")

    parser.add_option("--output-pattern",
                      dest="output_pattern",
                      type="string",
                      help="output pattern for output files.")

    parser.add_option("--xrate-min-increment",
                      dest="xrate_min_increment",
                      type="float",
                      help="minimum increment to stop iteration in xrate.")

    parser.set_defaults( \
        input_format = "fasta",
        filename_tree = None,
        with_counts = False,
        sites = "d4",
        distance = "T92",
        min_sites = 1,
        filename = "-",
        alphabet="auto",
        format= "%6.4f",
        method="phylip",
        kappa = None,
        fix_kappa = False,
        alpha = None,
        fix_alpha = False,
        dump = False,
        clean_data = None,
        output_format = "list",
        iteration="all-vs-all",
        pairwise=False,
        report_step = 1000,
        output_pattern = "%s.eg",
        write = [],
        test_xrate = False,
        xrate_min_increment = None,
        is_codons = False,
        )

    (options, args) = E.Start(parser)

    if options.filename != "-":
        infile = open(options.filename, "r")
    else:
        infile = sys.stdin

    ## read multiple alignment
    if options.pairwise:
        ## read sequences, but not as a multiple alignment. This permits multiple names.
        mali = Mali.SequenceCollection()
        options.iteration = "pairwise"
    else:
        mali = Mali.Mali()

    mali.readFromFile(infile, format=options.input_format)

    ids = mali.getIdentifiers()

    if options.alphabet == "auto":
        s = "".join(map(lambda x: x.mString, mali.values())).lower()
        ss = re.sub("[acgtxn]", "", s)
        if float(len(ss)) < (len(s) * 0.1):
            options.alphabet = "na"
            if mali.getNumColumns() % 3 == 0:
                options.is_codons = True
        else:
            options.alphabet = "aa"

        if options.loglevel >= 1:
            options.stdlog.write("# autodetected alphabet: %s\n" %
                                 options.alphabet)

    if options.filename != "-":
        infile.close()

    npairs = 0
    nskipped_length = 0
    nskipped_distance = 0

    pairs = []
    if options.iteration == "all-vs-all":
        for x in range(len(ids) - 1):
            for y in range(x + 1, len(ids)):
                pairs.append((x, y))
    elif options.iteration == "first-vs-all":
        for y in range(1, len(ids)):
            pairs.append((0, y))
    elif options.iteration == "pairwise":
        if len(ids) % 2 != 0:
            raise ValueError(
                "uneven number of sequences (%i) not compatible "
                "with --iteration=pairwise" % len(ids))
        for x in range(0, len(ids), 2):
            pairs.append((x, x + 1))

    if options.alphabet == "na":

        if options.method == "baseml":
            runBaseML(mali, pairs, options)
        elif options.method == "phylip" and options.distance in ("F84", "K80",
                                                                 "JC69",
                                                                 "LogDet"):
            runDNADIST(mali, pairs, options)
        elif options.method == "xrate":
            runXrate(mali, pairs, options)
        else:
            if options.is_codons:
                h = Genomics.SequencePairInfoCodons().getHeader()
            else:
                h = Genomics.SequencePairInfo().getHeader()
            options.stdout.write("seq1\tseq2\tdist\tvar\t%s\n" % (h))

            for x, y in pairs:
                id_x = ids[x]
                npairs += 1

                id_y = ids[y]

                info = Genomics.CalculatePairIndices(
                    mali[id_x], mali[id_y], with_codons=options.is_codons)

                if options.distance in ("T92", "JC69"):
                    if options.sites == "d4":
                        seq1, seq2 = Genomics.GetDegenerateSites(mali[id_x],
                                                                 mali[id_y],
                                                                 position=3,
                                                                 degeneracy=4)

                        if len(seq1) < options.min_sites:
                            nskipped_length += 1
                            continue
                    else:
                        raise "unknown sites %s" % options.sites

                if options.distance == "T92":
                    distance, variance = CalculateDistanceT92(info)
                elif options.distance == "JC69":
                    distance, variance = CalculateDistanceJC69(info)
                elif options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])
                elif options.distance == "POVL":
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])

                if distance >= 0:
                    options.stdout.write("\t".join(
                        map(str, (id_x, id_y, options.format % distance,
                                  options.format % variance, info))) + "\n")
                else:
                    nskipped_distance += 1

    elif options.alphabet == "aa":

        if options.distance in ("JTT", "PMB", "PAM", "Kimura",
                                "CategoriesModel"):

            # use phylip for these
            phylip = WrapperPhylip.Phylip()
            phylip.setProgram("protdist")
            phylip.setMali(mali)

            phylip_options = []
            if options.distance == "PMG":
                phylip_options += ["D"] * 1
            elif options.distance == "PAM":
                phylip_options += ["D"] * 2
            elif options.distance == "Kimura":
                phylip_options += ["D"] * 3
            elif options.distance == "CategoriesModel":
                phylip_options += ["D"] * 4

            phylip_options.append("Y")
            phylip.setOptions(phylip_options)
            result = phylip.run()

            writePhylipResult(result, options)

        else:
            options.stdout.write("id1\tid2\tdist\tvar\n")

            ## iterate over all pairs of sequences
            for x, y in pairs:
                id_x = ids[x]
                npairs += 1

                id_y = ids[y]

                if options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])
                elif options.distance == "POVL":
                    ## percentage overlap
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])

                if distance >= 0:
                    options.stdout.write("\t".join(
                        (id_x, id_y, options.format % distance,
                         options.format % variance)) + "\n")
                else:
                    nskipped_distance += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# nseqs=%i, npairs=%i, nskipped_length=%i, nskipped_distance=%i\n"
            % (len(ids), npairs, nskipped_length, nskipped_distance))

    E.Stop()
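
For the nucleotide branch, the distances follow standard formulas; as an illustration, a hedged sketch of the Jukes-Cantor (JC69) distance from the proportion of mismatching sites (the textbook formula, not necessarily identical to the script's CalculateDistanceJC69):

import math

def jc69_distance(p, n):
    """return (distance, variance) under JC69 for an observed proportion p of
    differing sites over n compared sites; returns (-1, -1) at saturation,
    mirroring the skip-on-negative convention used above."""
    w = 1.0 - 4.0 * p / 3.0
    if w <= 0:
        return -1, -1
    distance = -0.75 * math.log(w)
    variance = p * (1.0 - p) / (n * w * w)
    return distance, variance

# jc69_distance(0.1, 300) -> (~0.107, ~0.00040)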
コード例 #49
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      help="method to use [kl=kullback-leibler]",
                      choices=("kl", ))
    parser.add_option("-n",
                      "--no-normalize",
                      dest="normalize",
                      action="store_false",
                      help="do not normalize data")
    parser.add_option("-p",
                      "--pseudocounts",
                      dest="pseudocounts",
                      type="int",
                      help="pseudocounts to add.")
    parser.add_option("-f",
                      "--number-format",
                      dest="number_format",
                      type="string",
                      help="number format.")

    parser.set_defaults(method="kl",
                        columns="all",
                        headers=True,
                        xrange=None,
                        pseudocounts=1,
                        normalize=True,
                        number_format="%6.4f")

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.xrange:
        options.xrange = map(float, options.xrange.split(","))

    data, legend = IOTools.readTable(sys.stdin,
                                     numeric_type=numpy.float32,
                                     take=options.columns,
                                     headers=options.headers,
                                     truncate=options.xrange)

    nrows, ncols = data.shape

    # first: normalize rows
    for y in range(1, ncols):
        for x in range(nrows):
            data[x, y] = data[x, y] + float(options.pseudocounts)
        if options.normalize:
            t = numpy.sum(data[:, y])
            for x in range(nrows):
                data[x, y] = data[x, y] / t

    for x in range(1, len(legend) - 1):
        for y in range(x + 1, len(legend)):

            if options.method == "kl":
                d1 = 0.0
                d2 = 0.0
                for bin in range(nrows):
                    p = data[bin, x]
                    q = data[bin, y]
                    d1 += p * math.log(p / q)
                    d2 += q * math.log(q / p)

                options.stdout.write(
                    "%s\t%s\t%s\n" %
                    (legend[x], legend[y], options.number_format % d1))
                options.stdout.write(
                    "%s\t%s\t%s\n" %
                    (legend[y], legend[x], options.number_format % d2))

    E.Stop()
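
The inner loop computes the two directed Kullback-Leibler divergences between each pair of normalized columns. An equivalent vectorized form, as a sketch, assuming p and q are one-dimensional numpy arrays that are strictly positive (pseudocounts added) and normalized:

import numpy

def kl_divergence(p, q):
    return float(numpy.sum(p * numpy.log(p / q)))

# d1 = kl_divergence(data[:, x], data[:, y])
# d2 = kl_divergence(data[:, y], data[:, x])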
コード例 #50
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version=
        "%prog version: $Id: contigs2random_sample.py 2871 2010-03-03 10:20:44Z nicki $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--species-map",
        dest="species_map",
        type="string",
        help="text file specifying the mapping between contig and genome")

    parser.add_option(
        "-g",
        "--genome-dir",
        dest="genome_dir",
        type="string",
        help="specify directory where genome / genomes are stored")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read in contig lengths into dictionary
    E.info("reading contigs file")
    c_contigs = 0
    contigs_lengths = {}
    for fasta in FastaIterator.iterate(options.stdin):
        c_contigs += 1

        # titles of fasta records must be single strings with no special
        # characters
        contigs_lengths[fasta.title.split(" ")[0]] = len(fasta.sequence)

    E.info("read %i contigs" % c_contigs)

    # read in mapping between species and contigs
    species_map = {}
    for line in open(options.species_map).readlines():
        data = line[:-1].split("\t")
        contig, species = data[0], data[1]
        species_map[contig] = species

    # read genomes into memory
    # NB this may need optimising if using large
    # genomes or many genomes
    E.info("reading genomes from %s" % options.genome_dir)

    # The directory must ONLY contain genome files!!
    genomes_sequences = {}
    c_genomes = 0
    for genome_file in glob.glob(os.path.join(options.genome_dir, "*")):
        c_genomes += 1
        for fasta in FastaIterator.iterate(IOTools.openFile(genome_file)):
            genomes_sequences[fasta.title] = fasta.sequence
    E.info("read %i genomes from %s" % (c_genomes, options.genome_dir))

    # iterate over the contigs and sample from the respective genome
    E.info("iterating over contigs")
    c_contigs_output = 0
    for contig, length in contigs_lengths.iteritems():
        if contig not in species_map:
            E.warn("contig %s not in species map file" % contig)
        else:
            c_contigs_output += 1
            genome = species_map[contig]
            genome_length = len(genomes_sequences[genome])

            # get the start position from which to sample, ensuring the
            # sampled fragment fits within the genome
            start = random.randint(0, max(0, genome_length - length))
            end = start + length

            sampled_seq = genomes_sequences[genome][start:end]
            options.stdout.write(
                ">%s_random\n%s\n" %
                (contig + "_%s" % species_map[contig], sampled_seq))

    E.info("written %i contigs" % c_contigs_output)
    # write footer and output benchmark information.
    E.Stop()
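
Sampling a fragment that is guaranteed to fit inside the source sequence only requires bounding the start coordinate; a small sketch of that helper (names are hypothetical):

import random

def sample_fragment(sequence, length):
    """return a random substring of the requested length, or the whole
    sequence if it is shorter than the requested length."""
    if length >= len(sequence):
        return sequence
    start = random.randint(0, len(sequence) - length)
    return sequence[start:start + length]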
コード例 #51
0
ファイル: bams2bam.py プロジェクト: zpeng1989/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped", type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions", type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions", type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour",
        dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches",
        dest="ignore_mismatches", action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.Samfile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = IOTools.readMap(
            IOTools.openFile(options.filename_map), has_header=True)
        id_map = dict([(y, x) for x, y in id_map.iteritems()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(IOTools.openFile(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(IOTools.openFile(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.Samfile(options.filename_transcriptome,
                                            "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.Samfile("-", "wh", template=genome_samfile)
    else:
        output_samfile = pysam.Samfile("-", "wb", template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.Samfile(options.filename_mismapped,
                                         "wb",
                                         template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.Samfile(options.filename_junctions,
                                          "rb")
    else:
        junctions_samfile = None

    c = _bams2bam.filter(genome_samfile,
                         output_samfile,
                         output_mismapped,
                         transcripts_samfile,
                         junctions_samfile,
                         transcripts,
                         regions=regions_to_remove,
                         unique=options.unique,
                         remove_contigs=options.remove_contigs,
                         colour_mismatches=options.colour_mismatches,
                         ignore_mismatches=options.ignore_mismatches,
                         ignore_transcripts=transcripts_samfile is None,
                         ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = IOTools.openFile(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.Stop()
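
The transcript map above is read from a two-column table and then inverted so lookups go from transcript number to transcript name. A self-contained sketch of that step, with a plain iterable of tab-separated lines standing in for IOTools.readMap:

def read_inverted_map(lines, has_header=True):
    id_map = {}
    for i, line in enumerate(lines):
        if has_header and i == 0:
            continue
        fields = line.rstrip("\n").split("\t")
        if len(fields) >= 2:
            # invert: second column becomes the key
            id_map[fields[1]] = fields[0]
    return id_map

# read_inverted_map(["name\tnumber", "ENST0001\t1"]) == {"1": "ENST0001"}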
コード例 #52
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: psl2fasta.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--filename-query",
                      dest="filename_query",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--filename-target",
                      dest="filename_target",
                      type="string",
                      help="fasta filename with target.")

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        choices=("full", "pileup-query", "pileup-target", "gapless"),
        help="method to use for constructing the alignment [%default].")

    parser.add_option(
        "--forward-query",
        dest="forward_query",
        action="store_true",
        help=
        "reverse-complement sequences such that query is always on forward strand [%default]"
    )

    parser.add_option("--target-prefix",
                      dest="target_prefix",
                      type="string",
                      help="prefix to use for target [%default].")

    parser.add_option("--query-prefix",
                      dest="query_prefix",
                      type="string",
                      help="prefix to use for query [%default].")

    parser.add_option("--id",
                      dest="id",
                      type="choice",
                      choices=("numeric", "query"),
                      help="choose type of identifier to use [%default]")

    parser.set_defaults(
        filename_query=None,
        filename_target=None,
        method="full",
        output_format_id="%06i",
        target_prefix="",
        query_prefix="",
        forward_query=False,
    )

    (options, args) = E.Start(parser)

    if options.filename_query:
        query = IndexedFasta.IndexedFasta(options.filename_query)

    if options.filename_target:
        target = IndexedFasta.IndexedFasta(options.filename_target)

    if options.method == "full":
        getAlignment = getAlignmentFull

    id = 0
    for match in Blat.iterator(options.stdin):
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        m = match.getMapQuery2Target()
        m.moveAlignment(-min(match.mQueryBlockStarts),
                        -min(match.mSbjctBlockStarts))
        q = query.getSequence(match.mQueryId, match.strand, match.mQueryFrom,
                              match.mQueryTo)
        t = target.getSequence(match.mSbjctId, "+", match.mSbjctFrom,
                               match.mSbjctTo)
        query_ali, sbjct_ali = getAlignment(m, q, t, options)

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement(query_ali)
            sbjct_ali = Genomics.complement(sbjct_ali)

        options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" % \
                                 (options.query_prefix,
                                  options.output_format_id % id,
                                  match.mQueryId, match.mQueryFrom, match.mQueryTo,
                                  query_ali,
                                  options.target_prefix,
                                  options.output_format_id % id,
                                  match.mSbjctId, match.strand, match.mSbjctFrom, match.mSbjctTo,
                                  sbjct_ali ) )
        id += 1

    E.Stop()
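
Constructing the gapped alignment strings from block coordinates can be sketched independently of alignlib; the helper below takes hypothetical sequence-local block coordinates and pads the gaps between consecutive blocks, which is the general idea behind the getAlignment* variants used above.

def blocks_to_alignment(query, target, blocks):
    """blocks: list of (query_start, target_start, size) in sequence-local
    coordinates, sorted; unaligned stretches between blocks become gaps."""
    qali, tali = [], []
    last_q, last_t = blocks[0][0], blocks[0][1]
    for qstart, tstart, size in blocks:
        qali.append("-" * (tstart - last_t))          # insertion in target
        tali.append(target[last_t:tstart])
        qali.append(query[last_q:qstart])             # insertion in query
        tali.append("-" * (qstart - last_q))
        qali.append(query[qstart:qstart + size])      # aligned block
        tali.append(target[tstart:tstart + size])
        last_q, last_t = qstart + size, tstart + size
    return "".join(qali), "".join(tali)

# blocks_to_alignment("ACGTACGT", "ACGTTACGT", [(0, 0, 4), (4, 5, 4)])
# -> ("ACGT-ACGT", "ACGTTACGT")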
コード例 #53
0
ファイル: gff2view.py プロジェクト: zpeng1989/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2view.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-t", "--target", dest="target", type="choice",
                      choices=("ucsc", "gbrowser"),
                      help="target location to open [%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("-f", "--add-flank", dest="flank", type="int",
                      help="add # nucleotides for each region.")

    parser.add_option("-z", "--zoom", dest="zoom", type="float",
                      help="zoom out (# > 1) or in (# < 1).")

    parser.add_option("-c", "--chunk-size", dest="chunk_size", type="int",
                      help="number of tabs to display in one go.")

    parser.add_option("--ucsc-assembly", dest="ucsc_assembly", type="string",
                      help="ucsc assembly.")

    parser.add_option("--ucsc-user-tracks", dest="ucsc_user_tracks", type="string",
                      help="ucsc user tracks.")

    parser.add_option("--gbrowser-assembly", dest="gbrowser_assembly", type="string",
                      help="gbrowser assembly.")

    parser.add_option("--randomize", dest="randomize", action="store_true",
                      help="randomize input [%default]")

    parser.set_defaults(
        ucsc_assembly="ponAbe2",
        ucsc_url="http://genome.ucsc.edu/cgi-bin/hgTracks",
        gbrowser_assembly="Songbird",
        gbrowser_url="http://genserv.anat.ox.ac.uk/cgi-bin/devel/gbrowse",
        genome_file=None,
        ucsc_custom_annotation="http://wwwfgu.anat.ox.ac.uk/~andreas/ucsc_tracks/%s",
        ucsc_user_tracks=None,
        flank=None,
        zoom=None,
        chunk_size=50,
        is_gtf=False,
        target="ucsc",
        randomize=False,
        joined=False,
    )

    (options, args) = E.Start(parser)

    if len(args) != 1:
        print USAGE
        raise "please specify the gff file to open."

    if options.is_gtf:
        entry_iterator = GTF.iterator
        chunk_iterator = GTF.flat_gene_iterator
    else:
        entry_iterator = GTF.iterator
        if options.joined:
            chunk_iterator = GTF.joined_iterator
        else:
            chunk_iterator = GTF.chunk_iterator

    if len(args) == "0" or args[0] == "-":
        iterator = chunk_iterator(entry_iterator(sys.stdin))
    else:
        iterator = chunk_iterator(entry_iterator(open(args[0], "r")))

    nopened = 0

    # b = webbrowser.get( "konqueror" )
    b = webbrowser.get("firefox")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.ucsc_user_tracks:
        annotations = "hgt.customText=%s" % (
            options.ucsc_custom_annotation % options.ucsc_user_tracks)
    else:
        annotations = None

    for chunk in iterator:
        start = min([x.start for x in chunk])
        end = max([x.end for x in chunk])

        if options.flank:
            start -= options.flank
            end += options.flank

        if options.zoom:
            s = end - start
            d = options.zoom * s - s
            start -= d
            end += d

        start = max(0, start)

        contig = chunk[0].contig

        if fasta:
            contig = fasta.getToken(contig)
            end = min(end, fasta.getLength(contig))

        if len(contig) < 3:
            contig = "chr%s" % contig

        if options.target == "ucsc":
            url_options = ["db=%s" % options.ucsc_assembly,
                           "position=%s:%i-%i" % (contig, start, end)]

            if annotations:
                url_options.append(annotations)

            url = "%s?%s" % (options.ucsc_url,
                             "&".join(url_options))
        elif options.target == "gbrowser":
            url = "%s/%s?name=%s:%i..%i" % (options.gbrowser_url,
                                            options.gbrowser_assembly,
                                            contig,
                                            start,
                                            end)

        print "# opening browser window for:"
        print "#", url

        if nopened % options.chunk_size == 0:
            if nopened != 0:
                x = raw_input(
                    'showing %i - hit return to continue:' % options.chunk_size)
            b.open_new(url)
            first = False
        else:
            b.open_new_tab(url)

        nopened += 1

    E.Stop()
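
The flank and zoom adjustments applied to each chunk can be written as a tiny helper; a sketch of the zoom arithmetic only (a factor above 1 widens the interval, below 1 narrows it, and the start is clipped at zero):

def zoom_interval(start, end, factor):
    span = end - start
    pad = factor * span - span
    return max(0, start - pad), end + pad

# zoom_interval(1000, 2000, 2.0) -> (0, 3000.0)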
コード例 #54
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/prune_multiple_alignment.py 2654 2009-05-06 13:51:22Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--master",
                      dest="master",
                      type="string",
                      help="master sequence.")

    parser.add_option("-p",
                      "--master-pattern",
                      dest="master_pattern",
                      type="string",
                      help="master pattern.")

    parser.add_option("--master-species",
                      dest="master_species",
                      type="string",
                      help="species to use as master sequences.")

    parser.add_option("-t",
                      "--translate",
                      dest="filename_translation",
                      type="string",
                      help="filename on where to store translated sequences.")

    parser.add_option("-e",
                      "--exons",
                      dest="filename_exons",
                      type="string",
                      help="filename on where to exon information.")

    parser.add_option("-c",
                      "--mark-codons",
                      dest="mark_codons",
                      action="store_true",
                      help="mark codons.")

    parser.add_option(
        "-i",
        "--ignore-case",
        dest="ignore_case",
        action="store_true",
        help="ignore case (otherwise: lowercase are unaligned chars).")

    parser.add_option("--remove-stops",
                      dest="remove_stops",
                      action="store_true",
                      help="remove stop codons.")

    parser.add_option("--mask-stops",
                      dest="mask_stops",
                      action="store_true",
                      help="mask stop codons.")

    parser.add_option("--mask-char",
                      dest="mask_char",
                      type="string",
                      help="masking character to use.")

    parser.add_option("-f",
                      "--remove-frameshifts",
                      dest="remove_frameshifts",
                      action="store_true",
                      help="remove columns corresponding to frameshifts.")

    parser.add_option(
        "--mask-master",
        dest="mask_master",
        action="store_true",
        help=
        "columns in master to be removed are masked to keep residue numbering."
    )

    parser.add_option(
        "-s",
        "--split-exons",
        dest="split_exons",
        action="store_true",
        help="split columns aligned to different exons in the same gene.")

    parser.add_option("-a",
                      "--target",
                      dest="target",
                      type="choice",
                      choices=("paml", ),
                      help="perform cleaning up for certain targets.")

    parser.set_defaults(
        gap_char="-",
        mask_char="n",
        gap_chars="-.",
        separator="|",
        master=None,
        master_species=None,
        filename_translation=None,
        filename_exons=None,
        master_pattern=None,
        remove_stops=False,
        mark_codons=False,
        mask_unaligned=False,
        split_exons=False,
        remove_frameshifts=False,
        min_segment_length=5,
        ignore_case=False,
        mask_stops=False,
        target=None,
        mask_master=False,
    )

    (options, args) = E.Start(parser)

    if options.target == "paml":
        options.mask_stops = True
        options.mask_char = "n"
        options.remove_frameshifts = True

        if options.loglevel >= 1:
            options.stdlog.write(
                "# setting output to paml : removing frameshifts, masking stops with '%s'.\n"
                % (options.mask_char))

    # 1. read multiple alignment in fasta format
    mali = Mali.Mali()

    mali.readFromFile(sys.stdin)

    if options.loglevel >= 1:
        options.stdlog.write("# read mali with %i entries.\n" % len(mali))

    if len(mali) == 0:
        raise "empty multiple alignment"

    identifiers = mali.getIdentifiers()

    masters = []
    if options.master:
        masters = options.master.split(",")
    elif options.master_pattern:
        for id in identifiers:
            if re.search(options.master_pattern, id):
                masters.append(id)
    elif options.master_species:
        for id in identifiers:
            if options.master_species == id.split(options.separator)[0]:
                masters.append(id)
    else:
        masters.append(identifiers[0])

    if options.loglevel >= 2:
        options.stdlog.write("# master sequences are: %s\n" % str(masters))
        options.stdlog.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                         filter=set(identifiers),
                                         from_zero=True)

        if options.loglevel >= 2:
            options.stdlog.write("# read exons %i sequences.\n" % len(exons))
    else:
        exons = {}

    ##########################################################################
    ##########################################################################
    ##########################################################################
    # translate characters to upper/lower case according to exon info.
    ##########################################################################
    if exons:
        for id in identifiers:
            if id in exons:
                mali.getSequence(id).mString = AddExonInformation(
                    mali[id], exons[id], mask_char=options.mask_char)

    elif options.ignore_case:
        # convert all to uppercase
        mali.upper()

    ##########################################################################
    ##########################################################################
    ##########################################################################
    # untangle misaligned exons
    ##########################################################################
    if exons and options.split_exons:

        # first split with masters
        if len(masters) > 0:
            SplitExons(mali, exons, masters=masters, options=options)

            if options.loglevel >= 4:
                mali.writeToFile(open("log_mali1", "w"), format="fasta")

        SplitExons(mali, exons, options)

    ##########################################################################
    ##########################################################################
    ##########################################################################
    # remove frameshifts
    ##########################################################################
    if options.remove_frameshifts:
        out_of_frame_columns = []
        if len(masters) == 1:

            frame_columns = GetFrameColumns(mali,
                                            masters[0],
                                            gap_chars=options.gap_chars)

        else:

            columns = []

            for id in masters:
                columns += GetFrameColumns(mali,
                                           id,
                                           gap_chars=options.gap_chars)

            if len(columns) == 0:
                columns += GetFrameColumns(mali,
                                           identifiers[0],
                                           gap_chars=options.gap_chars)

            # sort all codons by (start, end) position. For the same start
            # position, the "shortest" codon comes first: (1,2,3) before
            # (1,2,100), and (1,3,4) before (1,2,100).
            columns.sort(lambda x, y: cmp((x[0], x[2]), (y[0], y[2])))
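            # note (assumed equivalent): the cmp-style sort above is Python 2
            # only; under Python 3 the same ordering would be obtained with
            #     columns.sort(key=lambda x: (x[0], x[2]))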

            # select codons
            frame_columns = []
            last_codon = columns[0]

            for codon in columns[1:]:
                # skip identical codons
                if codon == last_codon:
                    continue

                # take first (shortest) codon in case of identical first
                # residue
                if codon[0] == last_codon[0]:
                    continue

                # if not overlapping, keep
                if codon[0] > last_codon[2]:
                    frame_columns.append(last_codon)
                else:
                    out_of_frame_columns += last_codon

                # if overlapping, but out of register: skip
                last_codon = codon

            frame_columns.append(last_codon)

        # build set of skipped columns
        frame_set = set()
        for column in frame_columns:
            for c in column:
                frame_set.add(c)

        # columns that contain a master sequence that is out of
        # frame
        out_of_frame_set = set(out_of_frame_columns)
        out_of_frame_set = out_of_frame_set.difference(frame_set)

        if options.loglevel >= 1:
            options.stdlog.write("# found %i/%i columns in frame\n" %
                                 (len(frame_columns) * 3, mali.getWidth()))

            if options.loglevel >= 5:
                options.stdlog.write("# frame columns: %i\n" %
                                     (len(frame_columns)))
                x = 0
                for column in frame_columns:
                    options.stdlog.write("# %i\t%s\n" %
                                         (x, ",".join(map(str, column))))
                    x += 1

            if options.loglevel >= 5:
                options.stdlog.write(
                    "# Out-of frame columns with residue of masters: %i\n" %
                    (len(out_of_frame_set)))
                options.stdlog.write("# %s" %
                                     ",".join(map(str, out_of_frame_columns)))

        mask_chars = (string.upper(options.mask_char),
                      string.lower(options.mask_char))
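        # note: string.upper()/string.lower() are Python 2 string-module calls;
        # the method forms options.mask_char.upper()/.lower() are the portable
        # equivalents.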

        to_delete = []

        ignore_case = exons or options.ignore_case

        for id in identifiers:

            ngaps, nmasked = 0, 0

            sequence = mali.getSequence(id).mString

            if options.loglevel >= 7:
                options.stdlog.write(
                    "# processing sequence %s of length %i with gaps\n" %
                    (id, len(sequence)))

            # treat masters differently if they are only to be masked, not
            # pruned: simply mask all characters that are to be skipped
            fragments = []
            nstops, ncodons, naligned = 0, 0, 0

            codon = []
            chars = []

            is_master = id in masters

            for x in range(len(sequence)):
                c = sequence[x]

                # delete columns that do not align to
                # a master.
                if x not in frame_set and x not in out_of_frame_set:
                    continue

                chars.append(c)
                if c not in options.gap_chars:
                    codon.append(c)
                if len(codon) % 3 == 0:
                    codon = "".join(codon)
                    codon_is_ok, codon_is_aligned, codon_is_all_gaps = checkCodon(
                        codon, options)

                    if codon_is_aligned:
                        naligned += 1

                    to_mask = False
                    if codon_is_all_gaps:
                        ngaps += len(chars)
                    elif codon_is_ok:
                        ncodons += 1
                        if string.upper(codon) in ("TAG", "TAA", "TGA"):
                            nstops += 1
                            to_mask = True
                    else:
                        to_mask = True
                        nmasked += 1

                    if to_mask:
                        for i in range(len(chars)):
                            if chars[i] not in options.gap_chars:
                                chars[i] = options.mask_char

                    fragments.append("".join(chars))
                    chars = []
                    codon = []

            # mask incomplete codons at the end
            if chars:
                for i in range(len(chars)):
                    if chars[i] not in options.gap_chars:
                        chars[i] = options.mask_char
                fragments.append("".join(chars))


            # else:
            #     for a, b, c in frame_columns:
            #         codon = sequence[a] + sequence[b] + sequence[c]
            #         codon_is_ok, codon_is_aligned, codon_is_all_gaps = checkCodon(codon, options)
            #         if codon_is_aligned:
            #             naligned += 1
            #         if codon_is_all_gaps:
            #             fragments.append(options.gap_char * 3)
            #             ngaps += 1
            #         elif codon_is_ok:
            #             ncodons += 1
            #             if string.upper(codon) in ("TAG", "TAA", "TGA"):
            #                 if options.remove_stops:
            #                     fragments.append(options.gap_char * 3)
            #                 elif options.mask_stops:
            #                     fragments.append(options.mask_char * 3)
            #                 else:
            #                     fragments.append(codon)
            #                 nstops += 1
            #             else:
            #                 fragments.append(codon)
            #         else:
            #             fragments.append(options.gap_char * 3)
            #             nmasked += 1
            #         if options.loglevel >= 7:
            #             options.stdlog.write(
            #                 "# %s: %i,%i,%i: codon=%s ok=%s is_aligned=%s\n" %
            #                 (id, a, b, c, codon, str(codon_is_ok), str(codon_is_aligned)))

            s = string.join(fragments, "")
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sequence: %s\tpositions: %i\taligned:%i\tcodons: %i\t stops: %i\tgaps: %i\tnmasked: %i\n"
                    % (id, len(fragments), naligned, ncodons, nstops, ngaps,
                       nmasked))
                options.stdlog.flush()

            # postpone deletion in order to not
            # confuse the iteration of ids
            if naligned == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned nucleotides.\n"
                    % id)
                to_delete.append(id)
            elif ncodons == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned codons.\n"
                    % id)
                to_delete.append(id)
            else:
                mali.setSequence(id, string.join(fragments, ""))

        for id in to_delete:
            del mali[id]

    for id in identifiers:
        if options.mark_codons:
            a = mali[id]
            f = lambda x: a[x:x + 3]
            s = string.join([f(x) for x in range(0, len(a), 3)], " ")
        else:
            s = mali[id]
        options.stdout.write(">%s\n%s\n" % (id, s))

    if options.filename_translation:
        outfile = open(options.filename_translation, "w")
        for id in mali.keys():
            outfile.write(">%s\n%s\n" %
                          (id, Genomics.TranslateDNA2Protein(mali[id])))
        outfile.close()

    E.Stop()
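
The masking loop in the example above walks the alignment codon by codon and masks any codon that is a stop or otherwise fails the check. A minimal standalone sketch of just the stop-codon masking step; the gap and mask characters are assumed stand-ins matching the script's defaults, and mask_if_stop is an illustrative helper, not part of the original:

GAP_CHARS = "-."        # assumed to match the script's gap_chars default
MASK_CHAR = "n"         # assumed to match the script's mask_char default
STOP_CODONS = ("TAG", "TAA", "TGA")

def mask_if_stop(chars):
    """Mask the non-gap characters of one aligned codon if it encodes a stop."""
    codon = "".join(c for c in chars if c not in GAP_CHARS)
    if codon.upper() in STOP_CODONS:
        return [MASK_CHAR if c not in GAP_CHARS else c for c in chars]
    return list(chars)

# mask_if_stop(list("T-AG")) -> ['n', '-', 'n', 'n']
# mask_if_stop(list("ATG"))  -> ['A', 'T', 'G']
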
コード例 #55
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--bam-file",
                      dest="bam_file",
                      type="string",
                      help="supply input bam file name")

    parser.add_option("-g",
                      "--gtf-file",
                      dest="gtf_file",
                      type="string",
                      help="supply input gtf file name")

    parser.add_option("-o",
                      "--outfile",
                      dest="outfile",
                      type="string",
                      help="supply output file name")

    parser.add_option(
        "-G",
        "--reference-GTF",
        dest="reference_gtf",
        type="string",
        help=
        "supply reference gtf for context of reads not contributing to transcripts"
    )

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ######################################################
    ######################################################
    # for all alignments
    ######################################################
    ######################################################

    # open outfile and prepare headers
    outf = open(options.outfile, "w")
    outf.write("\t".join([
        "total alignments", "aligments in transcripts",
        "percent alignments in transcripts", "total spliced alignments",
        "spliced alignments in transcripts",
        "percent spliced alignments in transcripts"
    ]) + "\n")

    # calculate coverage over transcript file - NB split reads contribute twice to the transcript
    # use BedTool object
    pybedbamfile = pybedtools.BedTool(options.bam_file)

    # count alignments
    E.info("counting total number of alignments and spliced alignments")
    total_alignments = 0
    spliced_alignments = 0

    for alignment in pybedbamfile:
        total_alignments += 1
        cigar = alignment[5]
        if cigar.find("N") != -1:  # an 'N' in the CIGAR string signifies a spliced (split) read
            spliced_alignments += 1

    # merge the gtf file to avoid double counting of exons in different transcripts - converts to a bed file
    gtffile = pybedtools.BedTool(options.gtf_file).merge()

    E.info("computing coverage of aligments in %s over intervals in %s" %
           (options.bam_file, options.gtf_file))
    cover = pybedbamfile.coverage(gtffile)

    # make sure that the exons aren't being counted twice - shouldn't be because of merge
    E.info("counting reads contributing to transcripts")
    c = 0
    for entry in cover:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage

    # sum the coverage across exons from all transcripts
    coverage_in_transcripts = c

    ######################################################
    ######################################################
    # for spliced alignments
    ######################################################
    ######################################################

    # count total number of spliced alignments
    # requires that the CIGAR string 'N' is present

    # uses pysam to write out a bam file of the spliced reads only
    allreads = pysam.Samfile(options.bam_file)
    spliced_bamname = P.snip(options.bam_file, ".bam") + "_spliced_reads.bam"

    # open file for outputting spliced alignments
    splicedreads = pysam.Samfile(spliced_bamname, "wb", template=allreads)

    # in pysam, a spliced alignment carries the CIGAR tuple (3, length),
    # i.e. op code 3 corresponds to 'N' (see the standalone sketch after this example)
    spliced = collections.defaultdict(list)
    for read in allreads:
        for cigar_tag in read.cigar:
            if cigar_tag[0] == 3:
                spliced[read].append(cigar_tag)

    # write out spliced alignments
    for read in spliced.keys():
        splicedreads.write(read)
    splicedreads.close()
    allreads.close()

    # index splice reads bam file
    pysam.sort(spliced_bamname, P.snip(spliced_bamname, ".bam"))
    pysam.index(spliced_bamname)
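    # note: this is the legacy pysam API, where pysam.sort(in_bam, out_prefix)
    # appends ".bam" to the prefix; with current pysam (assumed >= 0.9) the
    # equivalent call would be pysam.sort("-o", sorted_bam, in_bam).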

    # read in the spliced reads as a BedTool object
    splicedbam = pybedtools.BedTool(spliced_bamname)

    # compute coverage of spliced reads over intervals - the counts will be roughly
    # twice what they should be, because each spliced read overlaps two exons
    spliced_coverage = splicedbam.coverage(gtffile)

    # avoid double counting exons
    E.info("counting spliced reads contributing to transcripts")
    spliced_exons = {}
    c = 0
    for entry in spliced_coverage:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage

    spliced_coverage_in_transcripts = c

    # NOTE: the counting of spliced alignments is not accurate

    spliced_coverage_in_transcripts = float(
        spliced_coverage_in_transcripts) / 2

    ###########################
    ## write out the results ##
    ###########################

    outf.write(str(int(total_alignments)) + "\t")
    # remove half of the coverage assigned to spliced reads
    coverage_in_transcripts = (coverage_in_transcripts) - (
        spliced_coverage_in_transcripts)
    outf.write(
        str(
            int(coverage_in_transcripts) -
            int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(
        str(int((coverage_in_transcripts / total_alignments) * 100)) + "\t")

    # write out spliced counts
    outf.write(str(int(spliced_alignments)) + "\t")
    outf.write(str(int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(
        str(int((spliced_coverage_in_transcripts / spliced_alignments) * 100)))

    outf.close()

    ############################
    # contextualise those that
    # don't fall in transcripts
    ############################

    if options.reference_gtf:
        context_summary = open(
            P.snip(options.bam_file, ".bam") + ".excluded.context", "w")
        context_summary.write("\t".join(["Feature", "number"]) + "\n")

        # write out the read info as well
        context_file = open(
            P.snip(options.bam_file, ".bam") + ".excluded", "w")

        context_dict = collections.defaultdict(int)
        # intersect bam - write non-overlapping with transcripts - intersect with reference - write out
        context = pybedbamfile.intersect(gtffile, v=True, bed=True).intersect(
            pybedtools.BedTool(options.reference_gtf), wb=True)
        for entry in context:
            feature = entry[8]
            context_dict[feature] += 1
            context_file.write("\t".join([e for e in entry]) + "\n")

        for feature, value in context_dict.iteritems():
            context_summary.write("\t".join([feature, str(value)]) + "\n")

        context_file.close()
        context_summary.close()

    ## write footer and output benchmark information.
    E.Stop()
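
The example above flags an alignment as spliced when its CIGAR string contains an 'N' operation (op code 3 in pysam). A minimal, self-contained sketch of that check with the current pysam object API, using an assumed file name:

import pysam

def count_spliced(bam_path):
    """Count total and spliced ('N'-containing CIGAR) alignments in a BAM file."""
    total, spliced = 0, 0
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        for read in bam:
            total += 1
            # cigartuples is None for unmapped reads; op code 3 corresponds to 'N'
            if read.cigartuples and any(op == 3 for op, length in read.cigartuples):
                spliced += 1
    return total, spliced

# total, spliced = count_spliced("example.bam")   # assumed file name
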
コード例 #56
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version=
        "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("gff", "fasta", "aa"),
                      help="supply help")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.format == "gff":
        os.system("grep -v '#' | grep -v '^$'")

    elif options.format == "aa":
        pattern = "Protein"

    elif options.format == "fasta":
        pattern = "DNA"

    # Heuristic: a protein line is assumed to contain at least one amino-acid
    # letter that is not also a nucleotide letter (A, C, G or T) - THIS IS NOT
    # OPTIMAL. The list below holds the one-letter amino-acid codes that do not
    # overlap with A, C, G, T (a standalone sketch of this test appears after
    # this example).
    amino_acids = [
        "M", "R", "Q", "E", "H", "I", "L", "K", "F", "P", "S", "W", "Y", "V"
    ]
    result = []
    name = None
    for line in options.stdin.readlines():
        if not line.startswith("##") or line.find("date") != -1 or line.find(
                "gff") != -1 or line.find("source") != -1:
            continue
        data = line[2:-1]
        if data.startswith("%s" % pattern):
            name = data
            if result:
                options.stdout.write(">%s\n%s\n" %
                                     (prot_name, "".join(result)))
                result = []
        else:
            if pattern == "Protein":
                if "".join(map(str, [data.find(x) != -1 for x in amino_acids
                                     ])).find("True") != -1 and data.find(
                                         "end") == -1 and data.find(
                                             "%s" % pattern):
                    result.append(data)
                    prot_name = name
            elif pattern == "DNA":
                if name:
                    if "".join(
                            map(str, [data.find(x) != -1 for x in amino_acids
                                      ])).find("True") == -1 and data.find(
                                          "end") == -1 and data.find(
                                              "%s" % pattern):
                        result.append(data)
                        prot_name = name
    if result:
        options.stdout.write(">%s\n%s\n" % (prot_name, "".join(result)))

    # write footer and output benchmark information.
    E.Stop()
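
As noted in the comments above, the script separates protein from DNA lines with a letter-content heuristic. A standalone sketch of that test; the function name is illustrative, and the letter set mirrors the amino_acids list in the example:

# one-letter amino-acid codes used by the example that do not overlap with A, C, G, T
NON_NUCLEOTIDE_AA = set("MRQEHILKFPSWYV")

def looks_like_protein(sequence):
    """Heuristic: True if the line contains any amino-acid-only letter."""
    return any(residue in NON_NUCLEOTIDE_AA for residue in sequence.upper())

# looks_like_protein("ACGTACGT")   -> False (could be DNA)
# looks_like_protein("MKVLATTPWR") -> True
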
コード例 #57
0
        elif o in ("--version", ):
            print "version="
            sys.exit(0)
        elif o in ("-h", "--help"):
            print USAGE
            sys.exit(0)
        elif o in ("-o", "--file-output"):
            param_filename_output = a

    # 1. read multiple alignment in fasta format
    mali, identifiers = MaliIO.readFasta(sys.stdin)

    if param_loglevel >= 1:
        print "# read mali with %i entries." % len(identifiers)

    print E.GetHeader()
    print E.GetParams()

    # 1. remove gaps in multiple alignment

    mali = MaliIO.removeGaps(mali)

    if param_master:
        frame_columns = GetFrameColumns(mali, param_master)
    elif param_master_pattern:
        columns = []
        for id in identifiers:
            if re.search(param_master_pattern, id):
                columns += GetFrameColumns(mali, id)

        if len(columns) == 0:
コード例 #58
0
ファイル: mali2summary.py プロジェクト: yangjl/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2summary.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm"),
                      help="input format of multiple alignment")

    parser.add_option(
        "-a",
        "--alphabet",
        dest="alphabet",
        type="choice",
        choices=("aa", "na"),
        help="alphabet to use [default=%default].",
    )

    parser.add_option(
        "-p",
        "--pattern-mali",
        dest="pattern_mali",
        type="string",
        help="filename pattern for input multiple alignment files.")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        mask_chars="nN",
        gap_chars="-.",
        alphabet="na",
        pattern_mali=None,
    )

    (options, args) = E.Start(parser)

    if options.pattern_mali:
        prefix_header = "prefix\t"
        prefix_row = "\t"
    else:
        prefix_header = ""
        prefix_row = ""

    options.stdout.write(
        "%sncol_mean\tpcol_mean\tncol_median\tpcol_median\tnrow_mean\tprow_mean\tnrow_median\tprow_median\n"
        % (prefix_header, ))

    ninput, nskipped, noutput, nempty = 0, 0, 0, 0

    if options.pattern_mali:

        ids, errors = IOTools.ReadList(sys.stdin)

        E.debug("read %i identifiers.\n" % len(ids))

        nsubstitutions = len(re.findall("%s", options.pattern_mali))

        for id in ids:

            filename = options.pattern_mali % tuple([id] * nsubstitutions)
            ninput += 1

            if not os.path.exists(filename):
                nskipped += 1
                continue

            ## read multiple alignment in various formats
            mali = Mali.Mali()
            mali.readFromFile(open(filename, "r"), format=options.input_format)

            if mali.isEmpty():
                nempty += 1
                continue

            E.debug("read mali with %i entries from %s.\n" %
                    (len(mali), filename))

            if analyzeMali(mali, options, prefix_row="%s\t" % id):
                noutput += 1

    else:

        ## read multiple alignment in various formats
        mali = Mali.Mali()
        mali.readFromFile(sys.stdin, format=options.input_format)
        ninput += 1

        if mali.isEmpty():
            nempty += 1
        else:
            E.debug("read mali with %i entries." % (len(mali)))

            if analyzeMali(mali, options, prefix_row=""):
                noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, nempty=%i." %
           (ninput, noutput, nskipped, nempty))

    E.Stop()
コード例 #59
0
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-a",
                      "--as-gtf",
                      dest="as_gtf",
                      action="store_true",
                      help="output as gtf.")

    parser.add_option(
        "-f",
        "--id-format",
        dest="id_format",
        type="string",
        help="format for numeric identifier if --as-gtf is set and "
        "no name in bed file [%default].")

    parser.set_defaults(as_gtf=False, id_format="%08i", test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    as_gtf = options.as_gtf
    id_format = options.id_format

    gff = GTF.Entry()

    gff.source = "bed"
    gff.feature = "exon"

    ninput, noutput, nskipped = 0, 0, 0

    id = 0
    for bed in Bed.iterator(options.stdin):

        ninput += 1

        gff.contig = bed.contig
        gff.start = bed.start
        gff.end = bed.end
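        # bed.fields (assumed CGAT Bed class behaviour) holds the optional BED
        # columns beyond chrom/start/end, i.e. name, score, strand, ...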
        if bed.fields and len(bed.fields) >= 3:
            gff.strand = bed.fields[2]
        else:
            gff.strand = "."

        if bed.fields and len(bed.fields) >= 2:
            gff.score = bed.fields[1]

        if as_gtf:
            if bed.fields:
                gff.gene_id = bed.fields[0]
                gff.transcript_id = bed.fields[0]
            else:
                id += 1
                gff.gene_id = id_format % id
                gff.transcript_id = id_format % id
        else:
            if bed.fields:
                gff.source = bed.fields[0]

        options.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
コード例 #60
0
ファイル: split_genome.py プロジェクト: gsc0107/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: split_genome.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("-c",
                      "--chunk-size",
                      dest="chunk_size",
                      help="size of chunks in nucleotides.",
                      type="int")
    parser.add_option("-o",
                      "--filename-pattern-output",
                      dest="filename_pattern_output",
                      help="filename for output (should contain one '%i').",
                      type="string")

    parser.set_defaults(
        chunk_size=200000,
        filename_pattern_output="%i.fasta",
        width=100,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    nchunk = 0
    chunksize = 0
    pos = 0
    fragments = []
    outfile = None

    for line in sys.stdin:

        is_header = line[0] == ">"

        if is_header or chunksize > options.chunk_size:

            if outfile:
                rest = Print(outfile, fragments, options)
                chunksize = len(rest)
                pos -= chunksize
                fragments = [rest]
                outfile.close()

            else:
                fragments = []
                chunksize = 0

            nchunk += 1

            outfile = IOTools.openFile(
                options.filename_pattern_output % nchunk, "w")

            if is_header:
                description = line[1:-1]
                id = re.split("\s", description)[0]
                pos = 0

            outfile.write(">%s|%i|%i %s\n" % (id, nchunk, pos, description))

            if is_header:
                continue

        s = re.sub("\s", "", line[:-1])
        l = len(s)
        pos += l
        chunksize += l
        fragments.append(s)

    if outfile:
        rest = Print(outfile, fragments, options)
        outfile.close()