def splicer(gff, ftype, dline):
    '''
    Build one spliced sequence per second-level feature found in ``gff``.

    Root features are the lines of ``gff.lines`` whose ``line_type`` is
    'feature' and whose attributes carry no ``Parent`` entry. For every child
    of a root, the grandchildren whose ``type`` is listed in ``ftype`` are
    collected, ordered with ``function4gff.featureSort`` and the pieces
    returned by ``get_subseq`` are concatenated in that order. When no
    grandchild matches ``ftype``, CDS grandchildren are used instead and a
    warning is printed; when nothing usable is found the child is skipped
    with a warning.

    Args:
        gff: object exposing ``lines``, an iterable of line dicts. It is also
            handed unchanged to ``get_subseq``.
        ftype: sequence of feature type names to splice. ``ftype[0]`` decides
            the defline suffix and which "nothing found" warning is printed.
        dline: 'complete' selects the verbose defline; any other value
            selects the short ``>ID`` / ``>ID-CDS`` form.

    Returns:
        dict mapping defline to the concatenated sequence. Children that
        produce the same defline overwrite each other; the last one wins.
    '''
    seq = dict()
    # "in" instead of dict.has_key(): has_key() does not exist on Python 3.
    roots = [
        line for line in gff.lines
        if line['line_type'] == 'feature' and 'Parent' not in line['attributes']
    ]

    for root in roots:
        # NOTE: pseudogene roots are deliberately not skipped here, even when
        # CDS sequences are requested.
        rid = root['attributes'].get('ID', 'NA')

        for child in root['children']:
            cid = child['attributes'].get('ID', 'NA')
            cname = child['attributes'].get('Name', cid)

            if dline == 'complete':
                defline = '>{0:s}:{1:d}..{2:d}:{3:s}|{4:s}({8:s})|Parent={5:s}|ID={6:s}|Name={7:s}'.format(
                    child['seqid'], child['start'], child['end'],
                    child['strand'], child['type'], rid, cid, cname, ftype[0])
            elif ftype[0] == 'CDS':
                defline = '>{0:s}-CDS'.format(cid)
            else:
                defline = '>{0:s}'.format(cid)

            gchildren = child['children']
            segments = [g for g in gchildren if g['type'] in ftype]

            # Fall back to CDS features when none of the requested types exist.
            used_cds_fallback = not segments
            if used_cds_fallback:
                segments = [g for g in gchildren if g['type'] == 'CDS']

            if not segments:
                if ftype[0] == 'CDS':
                    print("WARNING There is no CDS feature for {0:s} in the input gff. The sequence of {0:s} is not generated.".format(cid))
                else:
                    print("WARNING There is no exon, nor CDS feature for {0:s} in the input gff. The sequence of {0:s} is not generated.".format(cid))
                continue

            if used_cds_fallback:
                print("WARNING There is no exon feature for {0:s} in the input gff. CDS features are used for splicing instead.".format(cid))

            # Orientation follows the strand of the last grandchild listed
            # under this child (segments is non-empty, hence so is gchildren).
            if gchildren[-1]['strand'] == '-':
                sort_seg = function4gff.featureSort(segments, reverse=True)
            else:
                sort_seg = function4gff.featureSort(segments)

            tmpseq = ''
            for segment in sort_seg:
                tmpseq = tmpseq + get_subseq(gff, segment)

            seq[defline] = tmpseq

    return seq
def check_duplicate(gff, linelist):
    '''
    Report pairs of features in ``linelist`` that duplicate each other.

    This function assumes that,
    1. Each gene is unique
    2. Children features such as Exons/CDSs do not contain multiple Parent IDs

    Note: If there are additional transcript types in the input gff, then you
    should go to intra_model.featureSort, and add the new transcript type to
    the dict of FEATURECODE.

    Two lines are candidates when their seqid, source, type, start, end,
    score and strand are all equal (phase is not compared). A candidate pair
    is reported when both lines have a 'children' entry, the two child lists
    have the same non-zero length and, after ordering both with
    ``function4gff.featureSort``, the children match position by position on
    the same seven fields. The fields are compared as raw values, so a
    non-string score does not break the comparison.

    Args:
        gff: object providing ``add_line_error(line, error_dict)``; it is
            called once for each line of every reported pair.
        linelist: sequence of line dicts to compare pairwise.

    Returns:
        A list with one dict per duplicate pair (keys 'ID', 'eCode',
        'eLines', 'eTag'), or None when no duplicate is found.
    '''
    from itertools import combinations

    eCode = 'Emr0001'
    eSet = list()

    def signature(line):
        # Fields that must all agree for two lines to count as the same.
        return (line['seqid'], line['source'], line['type'], line['start'],
                line['end'], line['score'], line['strand'])

    # combinations() yields (i, j) pairs with i < j in nested-loop order.
    pairs = [
        (source, target)
        for source, target in combinations(linelist, 2)
        if source['seqid'] == target['seqid']
        and signature(source) == signature(target)
    ]

    for source, target in pairs:
        # "in" instead of dict.has_key(): has_key() does not exist on Python 3.
        if 'children' not in source or 'children' not in target:
            continue
        schildren = source['children']
        tchildren = target['children']
        if len(schildren) != len(tchildren):
            continue

        # Both lines share the same strand here (it is part of the signature).
        reverse = source['strand'] == '-'
        sort_schildren = function4gff.featureSort(schildren, reverse=reverse)
        sort_tchildren = function4gff.featureSort(tchildren, reverse=reverse)

        # A pair without children is never reported as a duplicate.
        if not sort_schildren:
            continue
        if any(signature(s) != signature(t)
               for s, t in zip(sort_schildren, sort_tchildren)):
            continue

        source_id = source['attributes']['ID']
        target_id = target['attributes']['ID']
        message = 'Duplicate transcripts found between {0:s} and {1:s}'.format(source_id, target_id)

        result = dict()
        result['ID'] = [source_id, target_id]
        result['eCode'] = eCode
        result['eLines'] = [source, target]
        result['eTag'] = message
        eSet.append(result)

        # Each line gets its own error dict so the two records stay independent.
        for line in (source, target):
            gff.add_line_error(line, {'message': message, 'error_type': 'INTER_MODEL', 'eCode': eCode})

    if eSet:
        return eSet
    return None