Ejemplo n.º 1
0
def var_parser(outputfile, emu_out):
    """Merge mutation mentions from a tagger output file with EMU results.

    ``outputfile`` is read in full and split on blank lines into one chunk
    per article.  Inside a chunk, a line of the form ``<pmid>|a|<text>``
    supplies the article id and text; every other non-blank line must hold
    six tab-separated fields
    ``pmid, start, end, mention, type, normalized form``.

    ``emu_out`` is handed to ``emu_to_tmvar``, which is expected to yield
    ``(pmid, mention, normalized form)`` triples.  An EMU mention is added
    to an article only when its normalized form neither contains nor is
    contained in the ``id`` of an entry already collected for that article.

    Mentions whose normalization raises ``ValueError`` are reported on
    stdout and skipped; this applies to both sources so that one bad
    mention does not abort the whole generator.

    Yields:
        ``(pmid, text, mutation_entry_ls)`` per article chunk, where
        ``mutation_entry_ls`` is a list of ``Base.BioEntry`` objects.
        ``pmid`` is ``False`` and ``text`` is ``''`` when the chunk has no
        ``|a|`` line.
    """
    with open(outputfile, 'r') as f:
        mutation_results = [i.split('\n') for i in f.read().split('\n\n') if i]

    # pmid -> {mention string -> normalized form}
    emu = defaultdict(lambda: defaultdict(str))
    for pmid, string, norm in emu_to_tmvar(emu_out):
        emu[int(pmid)][string] = norm

    for article_result in mutation_results:
        text = ''
        pmid = False
        mutation_entry_ls = []
        for record in article_result:
            if not record.strip():
                # A trailing newline leaves an empty record behind; it is
                # not an annotation and must not reach the tuple unpacking.
                continue
            is_text = re.match(r'(\d+)\|a\|(.+)$', record)
            if is_text:
                text = is_text.group(2)
                pmid = int(is_text.group(1))
                continue
            _rpmid, start, end, string, _mtype, norm = record.split('\t')
            start = int(start)
            end = int(end)
            # The mention has the shape <letter><digits><letter> but the
            # normalized form starts with 'c': rewrite that leading 'c'
            # to 'p' before normalizing.
            if re.search(r'[A-T]\d+[A-T]', string.upper()):
                if norm.startswith('c'):
                    norm = norm.replace('c', 'p', 1)
            try:
                my_nm = normalization.mutation_string_normalizition(
                    string, norm)
            except ValueError as e:
                print('{0}\t{1:10}'.format(e, 'skip'))
                continue
            mutation_entry_ls.append(
                Base.BioEntry(start, end, 'mutation', string, norm, my_nm))

        for string, emu_nm in emu[pmid].items():
            # Duplicate check: substring match in either direction against
            # everything collected so far (including earlier EMU additions).
            is_exist = any(
                emu_nm in tmvar_mut.id or tmvar_mut.id in emu_nm
                for tmvar_mut in mutation_entry_ls)
            if is_exist:
                continue
            positions = list(__emu_get_pos(string, text))
            if not positions:
                continue
            try:
                my_nm = normalization.mutation_string_normalizition(
                    string, emu_nm)
            except ValueError as e:
                # Same policy as for the tagger mentions above.
                print('{0}\t{1:10}'.format(e, 'skip'))
                continue
            for start, end in positions:
                mutation_entry_ls.append(
                    Base.BioEntry(start, end, 'mutation', string, emu_nm,
                                  my_nm))
        yield (pmid, text, mutation_entry_ls)
Ejemplo n.º 2
0
def __GNormPlus_parser_old(outputdir):
    """Yield normalized gene/protein mentions from a directory of outputs.

    Every file found under ``outputdir`` (recursively, via ``os.walk``) is
    read in full.  Lines matching ``<digits>|a|<text>`` are skipped; every
    other non-empty line must hold six tab-separated fields
    ``pmid, start, end, mention, type, ids``.  Only lines whose type starts
    with ``gene`` or ``protein`` (case-insensitive) are kept.

    ``ids`` may list several identifiers separated by ``;``.  Each one is
    normalized with ``gnor.norm``; identifiers for which that raises
    ``KeyError`` are reported on stdout and skipped.

    Yields:
        ``(pmid, entry, fn)`` where ``entry`` is a ``Base.BioEntry`` built
        from the offsets, type, mention, the individual identifier and its
        normalized form, and ``fn`` is the bare file name the line came
        from.
    """
    gnor = normalization.gene_normor('../data/hgnc_complete_set.txt')
    for root, _dirs, files in os.walk(outputdir):
        for fn in files:
            # Read everything up front so the file handle is not kept open
            # while the consumer of this generator is paused.
            with open(os.path.join(root, fn), 'r') as f:
                records = [i for i in f.read().split('\n') if i]
            for r in records:
                if re.match(r'\d+\|a\|(.+)$', r):
                    continue
                pmid, start, end, string, entity_type, ids = r.split('\t')
                if not re.match('gene|protein', entity_type, re.IGNORECASE):
                    continue
                pmid = int(pmid)
                start = int(start)
                end = int(end)
                # One mention can map to several genes.
                for did in ids.split(';'):
                    try:
                        norm = gnor.norm(did)
                    except KeyError:
                        print('gene {0} skip: no gene in hgnc'.format(
                            string))
                        continue
                    # Pass the gene identifier itself; the previous code
                    # passed the builtin function ``id`` here by mistake.
                    yield (pmid,
                           Base.BioEntry(start, end, entity_type, string,
                                         did, norm), fn)
Ejemplo n.º 3
0
def GNormPlus_parser(outputfile):
    """Yield normalized gene/protein mentions from a single output file.

    ``outputfile`` is read in full and split on blank lines into one chunk
    per article.  Lines matching ``<pmid>|a|<text>`` and blank lines are
    skipped; every other line must hold six tab-separated fields
    ``pmid, start, end, mention, type, ids``.  Only lines whose type starts
    with ``gene`` or ``protein`` (case-insensitive) are kept.

    ``ids`` may list several identifiers separated by ``;``.  Each one is
    normalized with ``gnor.norm``; identifiers for which that raises
    ``KeyError`` are reported on stdout and skipped.

    Yields:
        ``(pmid, entry)`` where ``entry`` is a ``Base.BioEntry`` built from
        the offsets, type, mention, the individual identifier and its
        normalized form.
    """
    gnor = normalization.gene_normor('../data/hgnc_complete_set.txt')
    with open(outputfile, 'r') as f:
        gene_results = [i.split('\n') for i in f.read().split('\n\n') if i]
    for article_result in gene_results:
        for record in article_result:
            if not record.strip():
                # A trailing newline leaves an empty record behind; it is
                # not an annotation and must not reach the tuple unpacking.
                continue
            if re.match(r'(\d+)\|a\|(.+)$', record):
                # Article text line: carries no annotation.
                continue
            pmid, start, end, string, _type, ids = record.split('\t')
            if not re.match('gene|protein', _type, re.IGNORECASE):
                continue
            pmid = int(pmid)
            start = int(start)
            end = int(end)
            # One mention can map to several genes.
            for did in ids.split(';'):
                try:
                    norm = gnor.norm(did)
                except KeyError:
                    print('gene {0} skip: no gene in hgnc'.format(string))
                    continue
                # Pass the gene identifier itself; the previous code passed
                # the builtin function ``id`` here by mistake.
                yield (pmid,
                       Base.BioEntry(start, end, _type, string, did, norm))
Ejemplo n.º 4
0
def DNorm_parser(outputfile):
    """Yield disease mentions parsed from a tab-separated output file.

    The file is read in full.  Empty lines are ignored, and so is every
    line that does not split into exactly five tab-separated fields
    ``pmid, start, end, mention, id``.

    Yields:
        ``(pmid, entry)`` where ``pmid`` is an ``int`` and ``entry`` is
        ``Base.BioEntry(start, end, 'disease', mention, id)`` with integer
        offsets.
    """
    with open(outputfile, 'r') as handle:
        raw = handle.read()

    for line in raw.split('\n'):
        if not line:
            continue
        fields = line.split('\t')
        if len(fields) != 5:
            # Only complete five-column rows are reported.
            continue
        pmid, start, end, string, concept_id = fields
        pmid = int(pmid)
        start = int(start)
        end = int(end)
        entry = Base.BioEntry(start, end, 'disease', string, concept_id)
        yield (pmid, entry)
Ejemplo n.º 5
0
def GNormPlus_parser(outputdir):
    """Yield gene/protein mentions from every file below ``outputdir``.

    The directory tree is walked with ``os.walk`` and each file is read in
    full.  Empty lines and lines matching ``<digits>|a|<text>`` are
    skipped; every other line must hold six tab-separated fields
    ``pmid, start, end, mention, type, id``.  Lines whose type does not
    start with ``gene`` or ``protein`` (case-insensitive) are dropped.

    Yields:
        ``(pmid, entry, fn)`` where ``pmid`` is an ``int``, ``entry`` is
        ``Base.BioEntry(start, end, type, mention, id)`` with integer
        offsets, and ``fn`` is the bare name of the file the line came
        from.
    """
    wanted_type = re.compile('gene|protein', re.IGNORECASE)
    for folder, _subdirs, filenames in os.walk(outputdir):
        for fn in filenames:
            with open(os.path.join(folder, fn), 'r') as handle:
                lines = handle.read().split('\n')
                for line in lines:
                    if not line:
                        continue
                    if re.match(r'\d+\|a\|(.+)$', line) is not None:
                        # Article text line, not an annotation.
                        continue
                    pmid, start, end, string, kind, ident = line.split('\t')
                    if wanted_type.match(kind) is None:
                        continue
                    pmid = int(pmid)
                    start = int(start)
                    end = int(end)
                    entry = Base.BioEntry(start, end, kind, string, ident)
                    yield (pmid, entry, fn)
Ejemplo n.º 6
0
def tmvar_parser(outputfile):
    """Yield the mutation mentions of each article in a tagger output file.

    ``outputfile`` is read in full and split on blank lines into one chunk
    per article.  Inside a chunk, a line of the form ``<pmid>|a|<text>``
    supplies the article id and text; blank lines are ignored; every other
    line must hold six tab-separated fields
    ``pmid, start, end, mention, type, normalized form``.

    Yields:
        ``(pmid, text, mutation_entry_ls)`` per article chunk, where
        ``mutation_entry_ls`` is a list of
        ``Base.BioEntry(start, end, 'mutation', mention, normalized form)``
        objects with integer offsets.  ``pmid`` is ``False`` and ``text``
        is ``''`` when the chunk has no ``|a|`` line.

    Raises:
        ValueError: if a non-blank annotation line does not have exactly
            six tab-separated fields or its offsets are not integers.
    """
    with open(outputfile, 'r') as f:
        mutation_results = [i.split('\n') for i in f.read().split('\n\n') if i]
    for article_result in mutation_results:
        text = ''
        pmid = False
        mutation_entry_ls = []
        for record in article_result:
            if not record.strip():
                # A file ending in a single newline leaves an empty record
                # in the last chunk; skip it instead of failing to unpack.
                continue
            is_text = re.match(r'(\d+)\|a\|(.+)$', record)
            if is_text:
                text = is_text.group(2)
                pmid = int(is_text.group(1))
                continue
            _rpmid, start, end, string, _mtype, norm = record.split('\t')
            mutation_entry_ls.append(
                Base.BioEntry(int(start), int(end), 'mutation', string, norm))
        yield (pmid, text, mutation_entry_ls)