Example #1
0
def main():
    """Report the patterns of each virus protein that meet a % cutoff.

    Command line: <virus annotation file> <% MSA cutoff>.

    getCounts() is called on the annotation file and returns two mappings
    keyed by virus protein.  For each virus protein / pattern pair the
    percentage is 100 * len(entries for the pattern) / len(entries for the
    virus protein).  Pairs whose percentage is at or above the cutoff are
    printed to stdout as tab-separated rows; every pair, together with its
    percentage, is written to stderr.
    """
    arg_names = ['virus annotation file', '% MSA cutoff']
    arg_examples = ['../../Data/ProfileScan/hiv.prosite', '90']
    utils_scripting.checkStart(sys.argv, arg_names, arg_examples,
                               len(arg_names), True)

    annotation_path = sys.argv[1]
    cutoff = float(sys.argv[2])

    virus2annotation, virus2proteinCount = getCounts(annotation_path)

    for vp, pattern2hits in virus2annotation.items():
        for pattern, hits in pattern2hits.items():
            hit_count = float(len(hits))
            protein_count = float(len(virus2proteinCount[vp]))
            percent = 100.0 * hit_count / protein_count
            if percent >= cutoff:
                # A single parenthesised argument prints identically with
                # the print statement and the print function.
                print('\t'.join([vp, '0', '0', pattern, 'seq', 'ELM']))
            # Every pair is logged, whether or not it passed the cutoff.
            sys.stderr.write('\t'.join([vp, pattern, str(percent)]) + '\n')
Example #2
0
def main():
    """Report the motifs of each virus protein that meet a % cutoff.

    Command line: <virus annotation file> <annotation tool> <% MSA cutoff>.

    The annotation file is loaded through utils_motif.protein2annotation(),
    restricted to the given tool, and handed to getCounts(), which returns
    two mappings keyed by virus protein.  For each virus protein / motif
    pair the percentage is 100 * (count for the motif) / (count for the
    virus protein).  Pairs at or above the cutoff are printed to stdout as
    tab-separated rows whose last column is the tool name; every pair,
    together with its percentage, is written to stderr.
    """
    arg_names = ['virus annotation file', 'annotation tool', '% MSA cutoff']
    arg_examples = ['../../Data/ProfileScan/hiv.prosite', 'ProfileScan', '90']
    utils_scripting.checkStart(sys.argv, arg_names, arg_examples,
                               len(arg_names), True)

    annotation_path = sys.argv[1]
    tool = sys.argv[2]
    cutoff = float(sys.argv[3])

    wanted_tools = {tool: True}
    protein2annotation = utils_motif.protein2annotation(annotation_path,
                                                        wanted_tools)
    virus2annotation, virus2proteinCount = getCounts(protein2annotation)

    for vp, motif2count in virus2annotation.items():
        for motif, motif_count in motif2count.items():
            percent = (100.0 * float(motif_count)
                       / float(virus2proteinCount[vp]))
            if percent >= cutoff:
                # A single parenthesised argument prints identically with
                # the print statement and the print function.
                print('\t'.join([vp, '0', '0', motif, 'seq', tool]))
            # Every pair is logged, whether or not it passed the cutoff.
            sys.stderr.write('\t'.join([vp, motif, str(percent)]) + '\n')
Example #3
0
            for s in seq:
                tempSeq = tempSeq + s
            offset = 0
            while match:
                for elm in pattern2elm[elm_pattern]:
                    printResult(protein, elm, 
                                match, tempSeq, offset)
                tempSeq = tempSeq[int(match.start())+1:]
                offset += int( match.start() ) + 1
                match = p.search(tempSeq)

# Script entry: scan every sequence in a FASTA file against the patterns
# listed in a tab-separated pattern file.
# Command line: <pattern file> <fasta file>.
req_args = ['pattern file',
            'fasta file']
examples = ['../../Data/ELM/elm2pattern',
            '../../Data/FASTA/Human/hprd.intr.fasta']
utils_scripting.checkStart(sys.argv, req_args, examples, len(req_args), True)

input_pattern_file = sys.argv[1]
fasta_file = sys.argv[2]
pattern2regex = {}
# pattern string -> {elm name: True}; more than one ELM may map to the same
# pattern string, so the names are grouped per pattern.
pattern2elm = defaultdict(dict)
with open(input_pattern_file) as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (typically a trailing newline at the end of the
            # file) holds no record; unpacking it below would raise
            # ValueError and abort the whole run.
            continue
        # Each record is "<elm name>\t<pattern>".
        elm, pattern = line.split('\t')
        pattern2elm[pattern][elm] = True
# Compile each distinct pattern once, before any sequence is scanned.
for pattern in pattern2elm:
    pattern2regex[pattern] = re.compile(pattern)

for protein, seq in utils.fasta_iter(fasta_file):
    matchSeq(protein, seq, pattern2elm, pattern2regex)