def read_pe_values(lines):
    """Read all combined p-values and e-values.

    Visits every line index reported by ``next_pe_value_line`` and, for each
    hit, takes the gene name from the line two positions above it and parses
    the ``P-VALUE = ...`` and ``E-VALUE = ...`` fields found on the hit line.

    :param lines: indexable sequence of text lines to scan
    :return: list of ``(gene, pvalue, evalue)`` tuples in order of
        appearance; ``pvalue`` and ``evalue`` are converted to ``float``
    :raises ValueError: if an extracted field is not a valid float literal
    """
    result = []
    current_index = next_pe_value_line(0, lines)
    # next_pe_value_line signals "no further hit" with -1
    while current_index != -1:
        # the gene identifier sits two lines above the p-/e-value line
        gene = lines[current_index - 2].strip()
        line = lines[current_index]
        # raw strings keep the regex escapes (\s, \S) out of Python's own
        # string-escape processing
        pvalue = float(util.extract_regex(r'P-VALUE\s+=\s+(\S+)', line))
        evalue = float(util.extract_regex(r'E-VALUE\s+=\s+(\S+)', line))
        result.append((gene, pvalue, evalue))
        current_index = next_pe_value_line(current_index + 1, lines)
    return result
def read_annotations(lines, genes):
    """Extract annotations; genes are given as refseq ids.

    NOTE(review): this file contains a second definition of
    ``read_annotations`` directly after this one. The later definition
    replaces this one when the module is loaded, so one of the two can
    probably be removed.

    :param lines: indexable sequence of text lines to scan
    :param genes: container of gene identifiers; only entries whose name is
        ``in genes`` are processed
    :return: dict mapping gene name to the result of
        ``read_seqalign_blocks`` for that gene
    """
    result = {}
    current_index = next_pe_value_line(0, lines)
    # next_pe_value_line signals "no further hit" with -1
    while current_index != -1:
        # the gene identifier sits two lines above the p-/e-value line
        gene = lines[current_index - 2].strip()
        if gene in genes:
            info_line = lines[current_index]
            length = int(
                util.extract_regex(r'LENGTH\s+=\s+(\d+)', info_line))
            has_seqalign_block = True
            # a diagram made of a single number equal to the sequence
            # length is treated as "no alignment block follows"
            diagram_match = re.match(r'^\s+DIAGRAM:\s+(\d+)$',
                                     lines[current_index + 1])
            if diagram_match is not None:
                diagram = int(diagram_match.group(1))
                if diagram == length:
                    has_seqalign_block = False

            if has_seqalign_block:
                # the diagram line can span several lines and the blank
                # line after those can span several, so search for the
                # first non-blank line after the block of blank lines
                blank_index = current_index + 2
                while len(lines[blank_index].strip()) > 0:
                    blank_index += 1
                non_blank_index = blank_index + 1
                while len(lines[non_blank_index].strip()) == 0:
                    non_blank_index += 1
                result[gene] = read_seqalign_blocks(
                    lines, non_blank_index, length)

        current_index = next_pe_value_line(current_index + 1, lines)
    return result
def read_annotations(lines, genes):
    """Extract annotations; genes are given as refseq ids.

    For every p-/e-value line found by ``next_pe_value_line`` whose gene
    (taken from two lines above) is contained in ``genes``, the sequence
    length is read from the ``LENGTH = ...`` field and, unless the following
    ``DIAGRAM:`` line consists of just that length, the alignment blocks
    that follow are parsed with ``read_seqalign_blocks``.

    NOTE(review): an equivalent definition of ``read_annotations`` appears
    directly before this one in the file and is overridden by this one; one
    of the two can probably be removed.

    :param lines: indexable sequence of text lines to scan
    :param genes: container of gene identifiers; only entries whose name is
        ``in genes`` are processed
    :return: dict mapping gene name to the result of
        ``read_seqalign_blocks`` for that gene; genes without an alignment
        block are absent from the dict
    """
    result = {}
    current_index = next_pe_value_line(0, lines)
    # next_pe_value_line signals "no further hit" with -1
    while current_index != -1:
        # the gene identifier sits two lines above the p-/e-value line
        gene = lines[current_index - 2].strip()
        if gene in genes:
            length = int(util.extract_regex(r'LENGTH\s+=\s+(\d+)',
                                            lines[current_index]))
            diagram_match = re.match(r'^\s+DIAGRAM:\s+(\d+)$',
                                     lines[current_index + 1])
            # a diagram made of a single number equal to the sequence
            # length is treated as "no alignment block follows"
            has_seqalign_block = (diagram_match is None or
                                  int(diagram_match.group(1)) != length)

            if has_seqalign_block:
                # the diagram can wrap over several lines and may be
                # followed by several blank lines: first skip to the end of
                # the diagram text, then past the blank lines, to land on
                # the first line of the alignment block
                blank_index = current_index + 2
                while lines[blank_index].strip():
                    blank_index += 1
                non_blank_index = blank_index + 1
                while not lines[non_blank_index].strip():
                    non_blank_index += 1
                result[gene] = read_seqalign_blocks(lines, non_blank_index,
                                                    length)

        current_index = next_pe_value_line(current_index + 1, lines)
    return result
def extract_evalue(infoline):
    """Extract the e-value from the info line.

    Hands the pattern ``E-value =<whitespace><token>`` to
    ``util.extract_regex`` and converts what it returns to ``float``.

    :param infoline: text line expected to contain an ``E-value = ...`` field
    :return: the e-value as a ``float``
    :raises ValueError: if the extracted text is not a valid float literal
    """
    # raw string: \s and \S are regex escapes, not Python string escapes
    # NOTE(review): the pattern has no capture group, so util.extract_regex
    # presumably isolates the value after '=' itself - confirm in util
    return float(util.extract_regex(r'E-value =\s+\S+', infoline))
def extract_llr(infoline):
    """Extract the llr value from the info line.

    Hands the pattern ``llr =<whitespace><digits>`` to
    ``util.extract_regex`` and converts what it returns to ``int``.

    :param infoline: text line expected to contain an ``llr = ...`` field
    :return: the llr value as an ``int``
    :raises ValueError: if the extracted text is not a valid integer literal
    """
    # raw string: \s and \d are regex escapes, not Python string escapes
    # NOTE(review): the pattern has no capture group, so util.extract_regex
    # presumably isolates the value after '=' itself - confirm in util
    return int(util.extract_regex(r'llr =\s+\d+', infoline))
def extract_num_sites(infoline):
    """Extract the sites value from the info line.

    Hands the pattern ``sites =<whitespace><digits>`` to
    ``util.extract_regex`` and converts what it returns to ``int``.

    :param infoline: text line expected to contain a ``sites = ...`` field
    :return: the sites value as an ``int``
    :raises ValueError: if the extracted text is not a valid integer literal
    """
    # raw string: \s and \d are regex escapes, not Python string escapes
    # NOTE(review): the pattern has no capture group, so util.extract_regex
    # presumably isolates the value after '=' itself - confirm in util
    return int(util.extract_regex(r'sites =\s+\d+', infoline))
def extract_width(infoline):
    """Extract the width value from the info line.

    Hands the pattern ``width =<whitespace><digits>`` to
    ``util.extract_regex`` and converts what it returns to ``int``.

    :param infoline: text line expected to contain a ``width = ...`` field
    :return: the width value as an ``int``
    :raises ValueError: if the extracted text is not a valid integer literal
    """
    # raw string: \s and \d are regex escapes, not Python string escapes
    # NOTE(review): the pattern has no capture group, so util.extract_regex
    # presumably isolates the value after '=' itself - confirm in util
    return int(util.extract_regex(r'width =\s+\d+', infoline))