def export_naiveseqs_stat(gene, sequences):
    """Export one row of summary statistics per sequence for `gene`.

    Each row copies the identifying fields and the pre-computed ``Num*``
    counters of a sequence record; ``NumAAChanges`` is the length of the
    record's ``Mutations`` list.  The rows are handed to ``csv_writer`` with
    the target ``<ROOT>/data/naiveStudies/<gene>StatBySeq.csv``.

    :param gene: gene name; stored in the ``Gene`` column and used as the
        file-name prefix
    :param sequences: iterable of dict-like sequence records
    """
    # counters that are copied verbatim under the same column name
    counter_columns = (
        'NumInsertions', 'NumDeletions', 'NumStopCodons', 'NumApobecs',
        'NumUnusuals', 'NumFrameShifts'
    )
    header = ['Accession', 'PMID', 'Gene', 'Subtype', 'NumAAChanges']
    header += counter_columns

    rows = []
    for record in sequences:
        row = {
            'Accession': record['Accession'],
            'PMID': record['PubMedID'],
            'Gene': gene,
            'Subtype': record['Subtype'],
            'NumAAChanges': len(record['Mutations']),
        }
        for column in counter_columns:
            row[column] = record[column]
        rows.append(row)

    target = os.path.join(ROOT, 'data', 'naiveStudies',
                          '{}StatBySeq.csv'.format(gene))
    csv_writer(target, rows, header)
# Example #2
# 0
def main():
    """Compute and store the distance table of each gene.

    For 'gag' and 'gp41', ``calc_distances`` is run with a shared worker
    pool and its result is passed to ``csv_writer`` with the target
    ``<ROOT>/local/naiveStudies/<gene>NaiveDistance.csv``.
    """
    # keep two CPUs free, but never go below a single worker
    num_workers = max(1, cpu_count() - 2)
    columns = ('Sequence1', 'Sequence2', 'Distance')
    with Pool(num_workers) as pool:
        for gene in ('gag', 'gp41'):
            distances = calc_distances(gene, pool)
            target = os.path.join(ROOT, 'local', 'naiveStudies',
                                  '{}NaiveDistance.csv'.format(gene))
            csv_writer(target, distances, list(columns))
def export_unusuals(gene, sequences):
    """Export the ``NumUnusuals`` counter of every sequence.

    The rows (accession plus counter) are handed to ``csv_writer`` with the
    target ``<ROOT>/internalFiles/naiveStudies/<gene>Unusuals.csv``.

    :param gene: gene name, used as the file-name prefix
    :param sequences: iterable of dict-like sequence records
    """
    columns = ['Accession', 'NumUnusuals']
    rows = [
        {
            'Accession': record['Accession'],
            'NumUnusuals': record['NumUnusuals']
        }
        for record in sequences
    ]
    target = os.path.join(ROOT, 'internalFiles', 'naiveStudies',
                          '{}Unusuals.csv'.format(gene))
    csv_writer(target, rows, columns)
def export_aa_prevalence(gene, ptseqs):
    """Export amino-acid prevalence rows for `gene`.

    ``aggregate_aa_prevalence`` is called once with subtype ``None``
    (presumably "all subtypes" -- confirm against that helper) and once for
    every subtype returned by ``get_most_common_subtypes``.  The values of
    all returned mappings are concatenated, in that order, and passed to
    ``csv_writer`` with the target
    ``<ROOT>/data/naiveStudies/<gene>AAPrevalence.csv``.

    :param gene: gene name, used for the lookups and as file-name prefix
    :param ptseqs: sequence records; passed to the aggregation once per
        subtype, so it has to be re-iterable
    """
    columns = ['Gene', 'Subtype', 'Pos', 'AA', 'Pcnt', 'Count', 'PosTotal']
    per_subtype = [
        aggregate_aa_prevalence(gene, ptseqs, subtype).values()
        for subtype in [None] + get_most_common_subtypes(gene)
    ]
    target = os.path.join(ROOT, 'data', 'naiveStudies',
                          '{}AAPrevalence.csv'.format(gene))
    csv_writer(target, chain.from_iterable(per_subtype), columns)
def export_papers_table(filename, rows):
    """Write the reduced studies table to `filename`.

    Rows whose ``NumLANLIsolatesQCPassed`` value is falsy (zero or missing
    value) are dropped.  For the remaining rows that value is exported as
    ``NumPatients`` next to the bibliographic columns; the column layout
    comes from ``CLEAN_TABLE_HEADERS``.

    :param filename: target passed through to ``csv_writer``
    :param rows: iterable of dict-like review-table rows
    """
    # columns copied unchanged, listed in output order
    copied_columns = ('RxStatus', 'Title', 'Subtypes', 'Authors')

    papers = []
    for paper in rows:
        passed = paper['NumLANLIsolatesQCPassed']
        if passed:
            entry = {
                'PubMedID': paper['PubMedID'],
                'PubYear': paper['PubYear'],
                'NumPatients': passed,
            }
            for column in copied_columns:
                entry[column] = paper[column]
            papers.append(entry)
    csv_writer(filename, papers, CLEAN_TABLE_HEADERS)
def export_adindex(gene, sequences):
    """Export the APOBEC-mediated defectives index (ADIndex) per sequence.

    The index of a sequence is its ``NumApobecs`` value divided by the
    number of distinct positions listed by ``possible_apobecs_reader``.
    Rows are passed to ``csv_writer`` with the target
    ``<ROOT>/internalFiles/naiveStudies/apobec/<gene>NaiveADIndex.csv``.

    NOTE: when the reader yields no entries the divisor is zero and the
    first sequence raises ``ZeroDivisionError``.

    :param gene: gene name, used for the reader and as file-name prefix
    :param sequences: iterable of dict-like sequence records
    """
    # position -> set of target amino acids (right-hand side of 'X=>Y')
    sites = {}
    for entry in possible_apobecs_reader(gene):
        position = int(entry['Position'])
        if position not in sites:
            sites[position] = set()
        sites[position].add(entry['AAChange'].split('=>', 1)[1])
    num_sites = len(sites)

    rows = []
    for record in sequences:
        apobec_count = record['NumApobecs']
        ad_index = apobec_count / num_sites
        rows.append({
            'Accession': record['Accession'],
            'NumAPOBECs': apobec_count,
            'NumConservedAPOBECSites': num_sites,
            'ADIndex': ad_index
        })

    target = os.path.join(ROOT, 'internalFiles', 'naiveStudies', 'apobec',
                          '{}NaiveADIndex.csv'.format(gene))
    columns = ['Accession', 'NumAPOBECs', 'NumConservedAPOBECSites',
               'ADIndex']
    csv_writer(target, rows, columns)
# Example #7
# 0
def main():
    """Export the treatment-associated change tables for 'gag' and 'gp41'.

    Two tables are produced per gene through ``csv_writer``:

    * ``internalFiles/aaChangesByPosWPrev/<gene>.csv`` -- the 'PIs' rows of
      ``aggregate_aa_changes_by_pos`` followed by the 'NNRTIs' rows
    * ``internalFiles/codonChangesByPt/<gene>.csv`` -- the rows of
      ``codon_changes_per_person`` ordered by numeric PID, then Rx, then Pos
    """
    base = os.path.join(ROOT, 'internalFiles')

    for gene in ('gag', 'gp41'):
        outname = '{}.csv'.format(gene)

        aa_target = os.path.join(base, 'aaChangesByPosWPrev', outname)
        aa_rows = chain(
            aggregate_aa_changes_by_pos(gene, 'PIs', 'PIs'),
            aggregate_aa_changes_by_pos(gene, 'NNRTIs', 'NNRTIs'),
        )
        aa_columns = ['Group', 'Pos', 'PreAA', 'PostAA',
                      'NumPts', 'PrePrev', 'PostPrev', 'Fold', 'LogFold']
        csv_writer(aa_target, aa_rows, aa_columns)

        codon_target = os.path.join(base, 'codonChangesByPt', outname)
        codon_rows = list(codon_changes_per_person(gene, ('PIs', 'NNRTIs')))
        # PID is compared numerically so that e.g. 10 sorts after 9
        codon_rows.sort(key=lambda c: (int(c['PID']), c['Rx'], c['Pos']))
        codon_columns = ['PID', 'Rx', 'Pos', 'Type', 'Codons',
                         'NumNAChanges', 'AAs']
        csv_writer(codon_target, codon_rows, codon_columns)
def find_possible_apobecs(gene, ptseqs):
    """Tabulate candidate APOBEC-signature amino-acid changes for `gene`.

    The function works in three passes:

    1. Only sequences that carry an M=>I change at position 1 or a truthy
       ``NumStopCodons`` are inspected.  In those, two kinds of changes are
       counted per ``(position, 'X=>Y')`` key:

       * W=>* changes, and
       * changes whose codon matches a single G=>A substitution in one of
         the codons of the reference amino acid (synonymous results and
         stop codons are not counted here),

       in both cases only where the reference amino acid has a prevalence
       (``Pcnt``) of at least 97.5 in the profile returned by
       ``aggregate_aa_prevalence``.
    2. For every counted key, all sequences of `ptseqs` (not only the
       filtered ones) that show the target amino acid at that position are
       counted as ``'# Sequence'``.
    3. ``'% with Stop'`` is the pass-1 count times 100 divided by
       ``'# Sequence'``; keys with a value above 50 are kept, ordered by
       position and change, and passed to ``csv_writer`` with the target
       ``<ROOT>/data/naiveStudies/apobec/<gene>PossibleApobecs.csv``.

    :param gene: gene name, used for the profile lookup and the file name
    :param ptseqs: sequence records; iterated several times, so it has to
        be re-iterable (e.g. a list)
    """
    filename = os.path.join(ROOT, 'data', 'naiveStudies', 'apobec',
                            '{}PossibleApobecs.csv'.format(gene))
    # (aa position, 'X=>Y') -> number of hits in the filtered sequences
    apobecs = Counter()
    # indexed below as profile[(position, amino acid)]['Pcnt']
    profile = aggregate_aa_prevalence(gene, ptseqs)
    for seq in ptseqs:
        naseq = seq['AlignedNASequence']
        # Find every 'A' that is followed (after optional '-' gap
        # characters) by 'A' or 'G', i.e. the possible outcome of a
        # GG=>AG or GG=>AA change.  The lookahead does not consume the
        # following base, so adjacent candidates are all reported.
        matches = re.finditer('A-*(?=[AG])', naseq)
        # one mutation per position; a later entry replaces an earlier one
        muts = {m['Position']: m for m in seq['Mutations']}
        start_codon_apobec_changed = False
        if 1 in muts:
            first = muts[1]
            start_codon_apobec_changed = (first['ReferenceText'] == 'M'
                                          and 'I' in first['AminoAcidText'])
        if not start_codon_apobec_changed and not seq['NumStopCodons']:
            # no M=>I and no W=>* changes
            continue

        if seq['NumStopCodons']:
            for mut in seq['Mutations']:
                if '*' in mut['AminoAcidText']:
                    cons = mut['ReferenceText']
                    # only stop codons that replace a tryptophan count
                    if cons != 'W':
                        continue
                    aa_pos = mut['Position']
                    cons_prev = profile[(aa_pos, cons)]['Pcnt']
                    if cons_prev < 97.5:
                        # skip non-conserved position
                        continue
                    apobecs[(aa_pos, 'W=>*')] += 1

        for match in matches:
            start, end = match.span(0)
            # na2 = naseq[end]
            # Map the index of the matched 'A' to a 1-based codon number
            # and an offset inside that codon.
            # NOTE(review): assumes naseq[0] is the first base of codon 1
            # -- confirm against the alignment producer.
            aa_pos = start // 3 + 1
            na_offset = start % 3
            if aa_pos not in muts:
                continue

            mut = muts[aa_pos]
            cons = mut['ReferenceText']

            if mut['IsPartial'] or mut['IsInsertion'] or mut['IsDeletion']:
                continue

            # stop codons were already handled by the W=>* loop above
            if '*' in mut['AminoAcidText']:
                continue

            cons_prev = profile[(aa_pos, cons)]['Pcnt']
            if cons_prev < 97.5:
                # skip non-conserved position
                continue

            codon = mut['CodonText']
            # Try every codon of the reference amino acid as the possible
            # origin; only the first one that explains the observed codon
            # is used.
            for source in get_codons(cons):
                # find G=>A hypermutation
                if source[na_offset] != 'G':
                    continue
                target = source[:na_offset] + 'A' + source[na_offset + 1:]
                if compare_codon(target, codon):
                    target_aa = translate_codon(target)
                    if target_aa == cons:
                        # do not add things like "E=>E"
                        break
                    apobecs[(aa_pos, '{}=>{}'.format(cons, target_aa))] += 1
                    break

    # turn the counter into row dicts, most frequent change first
    possible_apobecs = []
    for (pos, mut), count in apobecs.most_common():
        possible_apobecs.append({
            'Position':
            pos,
            'AAChange':
            mut,
            'Consensus %':
            profile[pos, mut.split('=>', 1)[0]]['Pcnt'].quantize(PREC1),
            '# with Stop':
            count
        })

    # count, over ALL sequences, how often each target amino acid occurs
    for seq in ptseqs:
        muts = {m['Position']: m for m in seq['Mutations']}
        for apobec in possible_apobecs:
            pos = apobec['Position']
            aa = apobec['AAChange'].split('=>', 1)[1]
            if pos in muts and aa in muts[pos]['AminoAcidText']:
                apobec['# Sequence'] = apobec.get('# Sequence', 0) + 1
    for apobec in possible_apobecs:
        # NOTE(review): '# Sequence' is only set by the loop above; a row
        # that never matched there would raise KeyError here -- confirm
        # that every counted change is always found again.
        apobec['% with Stop'] = Decimal(apobec['# with Stop'] * 100 /
                                        apobec['# Sequence']).quantize(PREC0)
    possible_apobecs = sorted(possible_apobecs,
                              key=lambda a: (a['Position'], a['AAChange']))
    # keep changes seen mostly in the filtered (stop / M=>I) sequences
    possible_apobecs = [a for a in possible_apobecs if a['% with Stop'] > 50]

    # '# with Stop' is not part of the header; 'extrasaction': 'ignore'
    # is passed so that the extra key is tolerated by the writer
    csv_writer(
        filename,
        possible_apobecs,
        ['Position', 'AAChange', 'Consensus %', '% with Stop', '# Sequence'],
        writer_options={'extrasaction': 'ignore'})
def create_review_table(gene, ptseqs):
    """Build the per-publication review table of `gene` and export it.

    Sequences are grouped by ``_PubID`` in the order given (``groupby``
    presumably is ``itertools.groupby``, which only merges neighbouring
    items).  Each group is enriched with the matching entry of
    ``get_fact_table``; a truthy ``PubIDCorrection`` in that entry replaces
    the publication ID.  When a publication ID shows up again, the new group
    is merged into the existing row: subtypes are united, missing year /
    PubMed ID / RxStatus are filled in and the counters are summed.

    The rows, ordered by descending ``NumLANLIsolates`` and then ``PubID``,
    are written to ``internalFiles/papersReview/<gene>ReviewTable.csv`` and
    ``.xlsx``, and forwarded to ``export_papers_table`` for
    ``data/naiveStudies/<gene>Studies.csv``.

    :param gene: gene name, used for the fact table and the file names
    :param ptseqs: iterable of dict-like sequence records
    """
    facts = get_fact_table(gene)
    by_pubid = {}

    for raw_pubid, members in groupby(ptseqs, lambda s: s['_PubID']):
        members = list(members)
        group_subtypes = sorted({m['Subtype'] for m in members})
        first = members[0]
        # the fact table is keyed by the uncorrected publication ID
        fact = facts.get(raw_pubid, {})
        pubid = fact.get('PubIDCorrection') or raw_pubid

        if pubid in by_pubid:
            entry = by_pubid[pubid]
            known_subtypes = entry['Subtypes'].split('; ')
            entry['Subtypes'] = '; '.join(
                sorted(set(known_subtypes + group_subtypes)))
            if not entry.get('PubYear'):
                entry['PubYear'] = fact.get('PubYr') or first['PubYear']
            if not entry.get('PubMedID'):
                entry['PubMedID'] = fact.get('PMID') or first['PubMedID']
            # fact-table counts are added up and stored back as text, but
            # only when the sum is non-zero
            for field in ('NumPts', 'NumIsolates'):
                total = int(entry[field] or 0)
                total += int(fact.get(field) or 0)
                if total:
                    entry[field] = str(total)
            entry['NumLANLIsolates'] += len(members)
            entry['NumLANLIsolatesQCPassed'] += len(
                [m for m in members if m['Included']])
            if not entry.get('RxStatus'):
                entry['RxStatus'] = fact.get('RxStatus')
            continue

        # first occurrence of this publication
        by_pubid[pubid] = {
            'PubID': pubid,
            'PubMedID': fact.get('PMID') or first['PubMedID'],
            'PubYear': fact.get('PubYr') or first['PubYear'],
            'NumPts': fact.get('NumPts'),
            'NumIsolates': fact.get('NumIsolates'),
            'NumLANLIsolates': len(members),
            'NumLANLIsolatesQCPassed': len(
                [m for m in members if m['Included']]),
            'Title': first['Title'],
            'Authors': first['Authors'],
            'Subtypes': '; '.join(group_subtypes),
            'RxStatus': fact.get('RxStatus'),
            'Notes': fact.get('Notes'),
        }

    def order(row):
        # largest publications first, ties broken by publication ID
        return -row['NumLANLIsolates'], row['PubID']

    table = sorted(by_pubid.values(), key=order)

    review_dir = os.path.join(ROOT, 'internalFiles', 'papersReview')
    csv_writer(
        os.path.join(review_dir, '{}ReviewTable.csv'.format(gene)),
        table, REVIEW_TABLE_HEADERS)
    export_excel_table(
        os.path.join(review_dir, '{}ReviewTable.xlsx'.format(gene)), table)
    export_papers_table(
        os.path.join(ROOT, 'data', 'naiveStudies',
                     '{}Studies.csv'.format(gene)), table)
# Example #10
# 0
def export_naive_sequences(gene, ptseqs):
    """Export the naive sequences of `gene` in several formats.

    The sequences are ordered by accession and written as:

    * ``internalFiles/naiveStudies/<gene>.csv`` -- one row per sequence with
      its counters and one ``P<n>`` column per amino-acid position
    * ``data/naiveStudies/<gene>NaiveIndels.csv`` -- one row per non-partial
      insertion or deletion
    * ``data/naiveStudies/<gene>NaiveAligned.fas`` and
      ``<gene>NaiveOriginal.fas`` -- FASTA files with the aligned and the
      original nucleotide sequences

    File names use the lower-cased gene name.  A summary line is printed at
    the end.

    Symbols in the ``P<n>`` columns: ``.`` outside ``FirstAA``..``LastAA``,
    ``-`` no mutation recorded, ``i`` insertion, ``d`` deletion, ``X``
    partial mutation or more than four amino acids, otherwise the
    amino-acid text itself.

    :param gene: gene name; must be a key of ``CONSENSUS``
    :param ptseqs: iterable of dict-like sequence records
    """
    prefix = gene.lower()
    data_dir = os.path.join(ROOT, 'data', 'naiveStudies')
    filename = os.path.join(ROOT, 'internalFiles', 'naiveStudies',
                            '{}.csv'.format(prefix))
    aligned_fasta = os.path.join(data_dir,
                                 '{}NaiveAligned.fas'.format(prefix))
    unaligned_fasta = os.path.join(data_dir,
                                   '{}NaiveOriginal.fas'.format(prefix))
    indels_csv = os.path.join(data_dir, '{}NaiveIndels.csv'.format(prefix))

    genesize = int(CONSENSUS[gene]['Size'])
    siteheaders = ['P{}'.format(i) for i in range(1, genesize + 1)]

    def site_symbol(pos, firstaa, lastaa, muts):
        # single-cell summary of position `pos` (see docstring for legend)
        if pos < firstaa or pos > lastaa:
            return '.'
        if pos not in muts:
            return '-'
        mut = muts[pos]
        if mut['IsInsertion']:
            return 'i'
        if mut['IsDeletion']:
            return 'd'
        if mut['IsPartial']:
            return 'X'
        aas = mut['AminoAcidText']
        return 'X' if len(aas) > 4 else aas

    rows = []
    indels = []
    ptseqs = sorted(ptseqs, key=lambda s: s['Accession'])
    for seq in ptseqs:
        firstaa = seq['FirstAA']
        lastaa = seq['LastAA']
        # one mutation per position; a later entry replaces an earlier one
        muts = {m['Position']: m for m in seq['Mutations']}
        row = {
            'PMID': seq['PubMedID'],
            'Accession': seq['Accession'],
            'RxStatus': 'Naive',
            'lanlSubtype': seq['Subtype'],
            'NumAAChanges': len(muts),
            'NumInsertions': seq['NumInsertions'],
            'NumDeletions': seq['NumDeletions'],
            'NumStopCodons': seq['NumStopCodons'],
            'NumApobecs': seq['NumApobecs'],
            'NumUnusuals': seq['NumUnusuals'],
            'NumFrameShifts': seq['NumFrameShifts']
        }
        for pos, pname in enumerate(siteheaders, 1):
            row[pname] = site_symbol(pos, firstaa, lastaa, muts)
        rows.append(row)

        for mut in seq['Mutations']:
            if mut['IsPartial']:
                continue
            isins = mut['IsInsertion']
            if not (isins or mut['IsDeletion']):
                continue
            # codon details are only exported for insertions
            indels.append({
                'Accession': seq['Accession'],
                'Gene': gene,
                'Position': mut['Position'],
                'IndelType': 'ins' if isins else 'del',
                'Codon': mut['CodonText'] if isins else '',
                'InsertedCodons': mut['InsertedCodonsText'] if isins else ''
            })

    stat_columns = [
        'PMID', 'Accession', 'RxStatus', 'lanlSubtype', 'NumAAChanges',
        'NumInsertions', 'NumDeletions', 'NumStopCodons', 'NumApobecs',
        'NumUnusuals', 'NumFrameShifts'
    ]
    csv_writer(filename, rows, stat_columns + siteheaders)

    indel_columns = [
        'Accession', 'Gene', 'Position', 'IndelType', 'Codon', 'InsertedCodons'
    ]
    csv_writer(indels_csv, indels, indel_columns)

    aligned_records = [
        '>{Accession}|{Subtype}\n{AlignedNASequence}'.format(**s)
        for s in ptseqs
    ]
    data_writer(aligned_fasta, '\n'.join(aligned_records))

    original_records = [
        '>{Accession}|{Subtype}\n{NASequence}'.format(**s)
        for s in ptseqs
    ]
    data_writer(unaligned_fasta, '\n'.join(original_records))
    print('- {} naive {} sequences were exported'.format(len(ptseqs), gene))