コード例 #1
0
ファイル: infernal.py プロジェクト: mikerobeson/pycogent
def CmalignScoreParser(lines):
    """Parser for the score table of a cmalign result.

        - IMPORTANT: Will only parse standard output from cmalign.

        - NOTE: Will only work with result files with a single CM
            as a query.  Will not work with multiple alignment result files
            that have been concatenated.

        - Each parsed hit has its fields in the following order:
        [seq idx, seq name, seq len, total bit score, struct bit score,
            avg prob, elapsed time]

        - Reading stops at the first line starting with '# STOCKHOLM 1.0';
            blank lines and lines starting with '#' are skipped.

        - Returns whatever the parser built by SeparatorFormatParser
            produces when called on the kept lines.
    """
    # Fields 0 (seq idx) and 2 (seq len) are converted with int; fields 3-5
    # (total bit score, struct bit score, avg prob) with float.  Seq name
    # and elapsed time are not listed and so are left to the converter's
    # default handling (presumably unchanged strings -- confirm against
    # ConvertFields).
    field_types = [(0, int), (2, int), (3, float), (4, float), (5, float)]
    converter = ConvertFields(field_types)

    score_lines = []
    for raw in lines:
        stripped = raw.strip()
        # The alignment itself follows the score table; nothing to parse
        # past this marker.
        if stripped.startswith('# STOCKHOLM 1.0'):
            break
        # Skip blank lines and comment lines.
        if not stripped or stripped.startswith('#'):
            continue
        # The original (unstripped) line is what gets handed to the parser.
        score_lines.append(raw)

    # sep=None is passed through to the parser (presumably meaning split on
    # runs of whitespace, as with str.split -- confirm).
    parse = SeparatorFormatParser(with_header=False,
                                  converter=converter,
                                  ignore=None,
                                  sep=None)
    return parse(score_lines)
コード例 #2
0
ファイル: infernal.py プロジェクト: mikerobeson/pycogent
def CmsearchParser(lines):
    """Parser for tabfile format cmsearch result.

        - IMPORTANT: Will not parse standard output from cmsearch.  You must
            use --tabfile with cmsearch to get correct format to use this
            parser.

        - NOTE: Will only work with search result files with a single CM
            as a query.  Will not work with multiple search result files
            that have been concatenated.

        - Each parsed hit has its fields in the following order:
        [target name, target start, target stop, query start, query stop,
            bit score, E-value, GC%]

        - NOTE(review): the converted column indices below run from 2 to 8,
            which implies one more leading column than the eight fields
            listed above -- confirm against actual --tabfile output.

        - Returns whatever the parser built by SeparatorFormatParser
            produces when called on the kept lines.
    """
    # Columns 2-5 and 8 are converted with int, column 6 with float.
    # Column 7 is deliberately not converted: the E-value is only present
    # if the CM is calibrated, so it is left as a string.
    field_types = [(2, int), (3, int), (4, int), (5, int), (6, float),
                   (8, int)]
    converter = ConvertFields(field_types)

    # Drop comment lines (those starting with a hash character).
    hit_lines = [entry for entry in lines if not entry.startswith('#')]

    parse = SeparatorFormatParser(with_header=False,
                                  converter=converter,
                                  ignore=None,
                                  sep=None)
    return parse(hit_lines)
コード例 #3
0
ファイル: bed.py プロジェクト: cameron-jack/Chippy
def MinimalBedParser(data, converter=converter):
    """yields parsed data lines from a BED file

    Arguments:
        - data: either a filename (names ending in '.bed.gz' are read
          through gzip) or any iterable of lines, e.g. an open file.
        - converter: passed to SeparatorFormatParser to convert the fields
          of each row.

    Only lines starting with 'chr' are treated as data; every other line
    (track/browser headers, comments, blank lines) is skipped.

    A file opened here from a filename is closed once its lines have been
    read.  A file object supplied by the caller is left open.

    NOTE: BED uses 0-based numbering"""
    # If given a filename for the data, open it and remember that we own it.
    opened_here = isinstance(data, str)
    if opened_here:
        if data.endswith('.bed.gz'):
            data = gzip.GzipFile(data, 'rb')
        else:
            data = open(data, 'r')

    data_lines = []
    try:
        for row in data:
            # Binary streams (e.g. GzipFile on Python 3) yield bytes, which
            # cannot be compared with a str prefix; decode those rows.  On
            # Python 2 bytes is str, so this branch is never taken.
            if isinstance(row, bytes) and not isinstance(row, str):
                row = row.decode()
            if row.startswith('chr'):
                data_lines.append(row)
    finally:
        # All lines are in memory now, so the handle can be released before
        # any row is yielded.
        if opened_here:
            data.close()

    parser = SeparatorFormatParser(converter=converter,
                                   with_header=False,
                                   sep="\t")

    for row in parser(data_lines):
        yield row
コード例 #4
0
ファイル: sam.py プロジェクト: cameron-jack/Chippy
def MinimalSamParser(data, converter=converter):
    """yields the reference lengths, then the records, from a sam file

    Arguments:
        - data: either a filename or any iterable of lines, e.g. an open
          file.  The input is read in a single pass, so it does not need
          to support seek().
        - converter: passed to SeparatorFormatParser to convert the fields
          of each record.

    The first item yielded is always a dict mapping reference sequence name
    to length, built from the @SQ header lines (empty if there are none).
    Every following item is one parsed alignment record.

    A file opened here from a filename is closed when the generator
    finishes or is closed.  A file object supplied by the caller is left
    open.

    NOTE: the default converter turns the 1-based numbering of POS into
    0-based numbering"""
    from itertools import chain

    # If given a filename for the data, open it and remember that we own it.
    opened_here = isinstance(data, str)
    if opened_here:
        data = open(data)

    try:
        rows = iter(data)

        # get the lengths dict
        lengths = {}
        # Holds the first non-header line (if any), which has to be read to
        # detect the end of the header and must not be lost.
        first_record = []
        for row in rows:
            if not row.startswith('@'):
                first_record.append(row)
                break
            if not row.startswith('@SQ'):
                continue
            # Assumes SN is the first tag and LN the second on an @SQ line.
            # Split each tag on the first ':' only, because reference names
            # may themselves contain ':'.
            tags = row.split()[1:]
            name = tags[0].split(':', 1)[1]
            length = int(tags[1].split(':', 1)[1])
            lengths[name] = length

        # Yield the lengths even when the file has no header or no records.
        yield lengths

        parser = SeparatorFormatParser(converter=converter,
                                       with_header=False,
                                       sep="\t")

        # Put the already-consumed first record back in front of the rest.
        for record in parser(chain(first_record, rows)):
            yield record
    finally:
        if opened_here:
            data.close()