def CmalignScoreParser(lines):
    """Parse the per-sequence score table printed by cmalign.

    - IMPORTANT: Only handles the standard output of cmalign.
    - NOTE: Only works on output produced with a single CM as the query;
        concatenated outputs from several alignments are not supported.
    - Reading stops at the first line starting with '# STOCKHOLM 1.0';
        blank lines and lines starting with '#' are skipped.
    - Each parsed row has the following order:
        [seq idx, seq name, seq len, total bit score, struct bit score,
         avg prob, elapsed time]
        Columns 0 and 2 are converted to int, columns 3, 4 and 5 to float;
        the remaining columns are left as the converter produces them.

    Returns whatever the SeparatorFormatParser callable returns for the
    retained lines.
    """
    # (column index, cast) pairs applied to every whitespace-split row.
    field_casts = [(0, int), (2, int), (3, float), (4, float), (5, float)]
    to_typed_row = ConvertFields(field_casts)

    # Collect the score rows; the original (unstripped) line is kept.
    score_rows = []
    for raw in lines:
        stripped = raw.strip()
        if stripped.startswith('# STOCKHOLM 1.0'):
            # Everything from here on is the alignment itself.
            break
        if not stripped or stripped.startswith('#'):
            continue
        score_rows.append(raw)

    # sep=None makes each row split on arbitrary whitespace.
    parse_rows = SeparatorFormatParser(with_header=False,
                                       converter=to_typed_row,
                                       ignore=None,
                                       sep=None)
    return parse_rows(score_rows)
def CmsearchParser(lines):
    """Parse a cmsearch result written in tabfile format.

    - IMPORTANT: Does not parse the standard output of cmsearch. Run
        cmsearch with --tabfile to obtain the format this parser expects.
    - NOTE: Only works on result files produced with a single CM as the
        query; concatenated results from several searches are not supported.
    - Lines starting with '#' are dropped before parsing.
    - Each parsed hit has the following order:
        [target name, target start, target stop, query start, query stop,
         bit score, E-value, GC%]
    - NOTE(review): the converter addresses column index 8, so rows
        presumably carry one more leading column (the model name) than the
        list above shows -- confirm against real --tabfile output.

    Returns whatever the SeparatorFormatParser callable returns for the
    retained lines.
    """
    # Columns 2-5 and 8 become int and column 6 becomes float.  Column 7 is
    # deliberately left unconverted: the E-value is only present when the CM
    # is calibrated.
    field_casts = [(2, int), (3, int), (4, int), (5, int), (6, float),
                   (8, int)]
    to_typed_row = ConvertFields(field_casts)

    # Keep every line that is not a '#' comment line.
    hit_lines = [entry for entry in lines if not entry.startswith('#')]

    # sep=None makes each row split on arbitrary whitespace.
    parse_hits = SeparatorFormatParser(with_header=False,
                                       converter=to_typed_row,
                                       ignore=None,
                                       sep=None)
    return parse_hits(hit_lines)
def MinimalBedParser(data, converter=converter):
    """Yield parsed data rows from a BED file.

    NOTE: BED uses 0-based numbering.

    Arguments:
        - data: either a path (str) or an iterable of lines. A path ending
          in '.bed.gz' is opened with gzip, any other path with open().
        - converter: row converter handed to SeparatorFormatParser; defaults
          to the module-level ``converter``.

    Only lines starting with 'chr' are treated as data; every other line
    (track/browser headers, comments, blank lines) is skipped.

    A file opened here from a path is closed once its lines have been read;
    a file-like object supplied by the caller is left open.
    """
    # Remember whether we own the handle so we only close what we opened.
    opened_here = isinstance(data, str)
    if opened_here:
        if data.endswith('.bed.gz'):
            # NOTE(review): on Python 3 a GzipFile opened with 'rb' yields
            # bytes, which would not work with the str prefix test below --
            # confirm which interpreter versions this module must support.
            data = gzip.GzipFile(data, 'rb')
        else:
            data = open(data, 'r')

    try:
        # All data lines are collected up front, so the handle can be
        # released before any row is parsed or yielded.
        data_lines = [row for row in data if row.startswith('chr')]
    finally:
        if opened_here:
            data.close()

    parser = SeparatorFormatParser(converter=converter, with_header=False,
                                   sep="\t")
    for row in parser(data_lines):
        yield row
def MinimalSamParser(data, converter=converter):
    """Yield the reference lengths, then the records, of a SAM file.

    NOTE: the default converter turns the 1-based numbering of POS into
    0-based numbering.

    Arguments:
        - data: either a path (str) or an iterable of lines. The input is
          read in a single pass, so it does not need to support seek().
        - converter: row converter handed to SeparatorFormatParser; defaults
          to the module-level ``converter``.

    The first item yielded is always a dict mapping each reference name to
    its length, built from the SN and LN tags of the '@SQ' header lines (it
    is empty when there are none). Every later item is one parsed,
    tab-separated alignment record.

    Raises KeyError if an '@SQ' line lacks an SN or LN tag and ValueError if
    LN is not an integer.

    A file opened here from a path is closed when the generator finishes or
    is closed; a file-like object supplied by the caller is left open.
    """
    from itertools import chain

    # Remember whether we own the handle so we only close what we opened.
    opened_here = isinstance(data, str)
    if opened_here:
        data = open(data)

    try:
        rows = iter(data)
        lengths = {}
        first_record = None
        for row in rows:
            if not row.startswith('@'):
                # First non-header line: keep it, it is the first record.
                first_record = row
                break
            if not row.startswith('@SQ'):
                continue
            # Look tags up by name and split on the first ':' only, so tag
            # order does not matter and names containing ':' stay intact.
            tags = dict(field.split(':', 1)
                        for field in row.split()[1:] if ':' in field)
            lengths[tags['SN']] = int(tags['LN'])

        yield lengths

        if first_record is None:
            # Header-only (or empty) input: there are no records to parse.
            return

        parser = SeparatorFormatParser(converter=converter,
                                       with_header=False, sep="\t")
        # Put the already-consumed first record back in front of the rest,
        # instead of rewinding the input and skipping the header again.
        for record in parser(chain([first_record], rows)):
            yield record
    finally:
        if opened_here:
            data.close()