def parseRm4( file ):
    """Parses readmatcher -printFormat 4 output into AlignmentHit objects

    Each line of ``file`` is split on single spaces and turned into one
    AlignmentHit, which is yielded to the caller. Fields are read by index:

        0      query id
        1      target id
        2      score (stored negated)
        3      percent identity
        4      query strand flag ("0" -> "+", anything else -> "-")
        5-7    query start, end, length
        8      target strand flag (same encoding as field 4)
        9-11   target start, end, length

    ``file`` is the path handed to ``open``. A line with fewer than 12
    fields raises IndexError; a non-numeric value in a numeric field
    raises ValueError.
    """
    # The with-block guarantees the handle is closed once the generator is
    # exhausted or closed, instead of relying on garbage collection.
    with open(file) as handle:
        for line in handle:
            fields = line.rstrip("\n").split(" ")

            hit = AlignmentHit()
            hit.query_id = fields[0]
            hit.target_id = fields[1]
            hit.score = -int(fields[2])
            hit.pctidentity = float(fields[3])

            hit.query_strand = "+" if fields[4] == "0" else "-"
            hit.query_start = int(fields[5])
            hit.query_end = int(fields[6])
            hit.query_length = int(fields[7])
            # For the negative strand readMatcher reports query coords on the
            # reverse complement of the sequence. For the AlignmentHit they
            # need to be reported on the forward strand, so flip them.
            if hit.query_strand == "-":
                hit.query_end, hit.query_start = (
                    hit.query_length - hit.query_start,
                    hit.query_length - hit.query_end,
                )

            hit.target_strand = "+" if fields[8] == "0" else "-"
            hit.target_start = int(fields[9])
            hit.target_end = int(fields[10])
            hit.target_length = int(fields[11])
            # Same forward-strand conversion for the target coordinates.
            if hit.target_strand == "-":
                hit.target_end, hit.target_start = (
                    hit.target_length - hit.target_start,
                    hit.target_length - hit.target_end,
                )

            # Strands are kept exactly as parsed; they are not normalised so
            # that the target is always "+".
            hit.alignedLength = abs(hit.target_end - hit.target_start)
            yield hit
_RM5_BASES = frozenset("ACGT")


def _rm5Bases( aligned ):
    """Returns the characters of ``aligned`` that are exactly A, C, G or T, in order"""
    return [base for base in aligned if base in _RM5_BASES]


def parseRm5( file ):
    """Parses readmatcher -printFormat 5 output into AlignmentHit objects

    Each line of ``file`` is split on single spaces and turned into one
    AlignmentHit, which is yielded to the caller. Fields are read by index:

        0      query id (truncated to its first three "/"-separated parts)
        1-3    query length, start, end
        4      query strand ("+" is stored as 0, anything else as 1)
        6      target id
        7-9    target length, start, end
        10     target strand (same encoding as field 4)
        11     score (stored negated)
        17     aligned query string
        18     stored as ``hit.aligned``
        19     aligned target string

    The A/C/G/T characters of the aligned query/target are appended to
    ``hit.QuerySeq`` / ``hit.TargetSeq`` before ``hit.revcomp()`` is called
    and to ``hit.RevQuerySeq`` / ``hit.RevTargetSeq`` afterwards; the joined
    contents of those lists are stored in the matching ``*Str`` attributes.

    ``file`` is the path handed to ``open``. A line with too few fields
    raises IndexError; a non-numeric value in a numeric field raises
    ValueError.
    """
    # The with-block guarantees the handle is closed once the generator is
    # exhausted or closed, instead of relying on garbage collection.
    with open(file) as handle:
        for line in handle:
            values = line.rstrip("\n").split(" ")

            hit = AlignmentHit()
            hit.query_id = values[0]
            hit.query_length = int(values[1])
            hit.query_start = int(values[2])
            hit.query_end = int(values[3])
            hit.query_strand = values[4]
            # target_id is values[6], not values[5], because there is white
            # space in the m5 file before target_id.
            hit.target_id = values[6]
            hit.target_length = int(values[7])
            hit.target_start = int(values[8])
            hit.target_end = int(values[9])
            hit.target_strand = values[10]
            hit.score = -1 * int(values[11])
            hit.query_id = "/".join(hit.query_id.split("/")[0:3])

            # NOTE(review): QuerySeq/TargetSeq/RevQuerySeq/RevTargetSeq are
            # appended to without being created here, so a fresh AlignmentHit
            # must already provide them. If they are class-level lists they
            # would accumulate across hits -- confirm against AlignmentHit.
            hit.alignedQuery = values[17]
            hit.QueryStrOrg = hit.alignedQuery
            hit.QuerySeq.extend(_rm5Bases(hit.alignedQuery))
            hit.QueryStr = ''.join(hit.QuerySeq)

            hit.alignedTarget = values[19]
            hit.TargetStrOrg = hit.alignedTarget
            hit.TargetSeq.extend(_rm5Bases(hit.alignedTarget))
            hit.TargetStr = ''.join(hit.TargetSeq)

            hit.aligned = values[18]
            hit.line = line

            # Strands are converted from "+"/other to 0/1 before revcomp().
            hit.target_strand = 0 if hit.target_strand == "+" else 1
            hit.query_strand = 0 if hit.query_strand == "+" else 1

            # NOTE(review): revcomp() is called for every hit here; the
            # original layout was ambiguous as to whether it belonged to the
            # non-"+" query strand branch only -- confirm.
            hit.revcomp()

            # Re-read the aligned strings after revcomp() and collect their
            # bases into the Rev* attributes.
            hit.RevQuerySeq.extend(_rm5Bases(hit.alignedQuery))
            hit.RevQueryStr = ''.join(hit.RevQuerySeq)

            hit.RevTargetSeq.extend(_rm5Bases(hit.alignedTarget))
            hit.RevTargetStr = ''.join(hit.RevTargetSeq)

            yield hit