Beispiel #1
0
    def __init__(self,
                 file=None,
                 rules_yaml=None,
                 genotype=None,
                 references=None):
        """ Create empty rule tables, then load ASI rules if a source is given.

        :param file: rules source handed to load_xml(); used in preference to
            rules_yaml when both are supplied
        :param rules_yaml: rules handed to load_yaml() when file is None
        :param genotype: forwarded to load_yaml() along with rules_yaml
        :param references: {name: reference} mapping for the standards; when
            None, the default project references are loaded and extended
            with the wild types read from WILD_TYPES_PATH
        """
        if references is None:
            references = ProjectConfig.loadDefault().getAllReferences()
            with WILD_TYPES_PATH.open() as wild_types_file:
                references.update(safe_load(wild_types_file))

        # Same entries as references, except that 'INT' is stored as 'IN'.
        self.stds = {('IN' if ref_name == 'INT' else ref_name): reference
                     for ref_name, reference in references.items()}

        # Algorithm info
        self.alg_name = ''
        self.alg_version = ''

        # Definitions
        # {code: [drug_class_code]}
        self.gene_def = {}
        # {'1': 'Susceptible'}
        self.level_def = {}
        # {code: [drug_code]}
        self.drug_class = defaultdict(list)
        # [['-INF', '9', '1'], ...] - the first two items are the range, and
        # the third one is the resistance level.
        self.global_range = []
        # {code: comment_text}
        self.comment_def = {}

        # {code: (name, [condition, [(action_type, action_value)]])}
        self.drugs = {}
        # Not really used at the moment.
        self.mutation_comments = []

        # An XML file wins over YAML rules when both are supplied.
        if file is not None:
            self.load_xml(file)
            return
        if rules_yaml is not None:
            self.load_yaml(rules_yaml, genotype)
Beispiel #2
0
def extract_v3loop_ref():
    """ Return the V3 loop reference, caching it next to this module.

    The text of v3loop_ref.txt is returned when that file exists. Otherwise,
    the reference is extracted from the G2P seed and the coordinate
    reference of the default project configuration, written to
    v3loop_ref.txt, and returned.
    """
    cache_path = os.path.join(os.path.dirname(__file__), 'v3loop_ref.txt')
    try:
        with open(cache_path) as cache_file:
            return cache_file.read()
    except FileNotFoundError:
        # No cached copy yet, so build it from the project references.
        projects = ProjectConfig.loadDefault()
        extracted_ref = extract_target(
            projects.getReference(G2P_SEED_NAME),
            projects.getReference(COORDINATE_REF_NAME))
        with open(cache_path, 'w') as cache_file:
            cache_file.write(extracted_ref)
        return extracted_ref
Beispiel #3
0
def extract_v3loop_ref():
    """ Return the V3 loop reference, caching it next to this module.

    The text of v3loop_ref.txt is returned when that file exists. Otherwise,
    the reference is extracted from the G2P seed and the coordinate
    reference of the default project configuration, written to
    v3loop_ref.txt, and returned.
    """
    cache_path = os.path.join(os.path.dirname(__file__), 'v3loop_ref.txt')
    try:
        with open(cache_path) as cache_file:
            return cache_file.read()
    except FileNotFoundError:
        # No cached copy yet, so build it from the project references.
        projects = ProjectConfig.loadDefault()
        extracted_ref = extract_target(
            projects.getReference(G2P_SEED_NAME),
            projects.getReference(COORDINATE_REF_NAME))
        with open(cache_path, 'w') as cache_file:
            cache_file.write(extracted_ref)
        return extracted_ref
Beispiel #4
0
def load_references():
    """ Collect the HCV references from the default project configuration.

    Only reference names shaped like HCV<genotype>-...-<region> are used,
    and only when the region is listed in HCV_REGIONS. Each genotype 6
    reference is registered a second time under genotype 6E.

    :return: {(genotype, region): Reference}
    """
    all_sequences = ProjectConfig.loadDefault().getAllReferences()
    hcv_references = {}
    for ref_name, sequence in all_sequences.items():
        name_match = re.match(r'HCV(.*?)-.*-([^-]+)$', ref_name)
        if name_match is None:
            continue
        genotype, region = name_match.groups()
        if region not in HCV_REGIONS:
            continue
        reference = Reference(ref_name, sequence)
        hcv_references[(genotype, region)] = reference
        if genotype == '6':
            # Share the same Reference object between 6 and 6E.
            hcv_references[('6E', region)] = reference
    return hcv_references
Beispiel #5
0
def load_references():
    """ Collect the HCV references from the default project configuration.

    Only reference names shaped like HCV<genotype>-...-<region> are used,
    and only when the region is listed in HCV_REGIONS. Each genotype 6
    reference is registered a second time under genotype 6E.

    :return: {(genotype, region): Reference}
    """
    all_sequences = ProjectConfig.loadDefault().getAllReferences()
    hcv_references = {}
    for ref_name, sequence in all_sequences.items():
        name_match = re.match(r'HCV(.*?)-.*-([^-]+)$', ref_name)
        if name_match is None:
            continue
        genotype, region = name_match.groups()
        if region not in HCV_REGIONS:
            continue
        reference = Reference(ref_name, sequence)
        hcv_references[(genotype, region)] = reference
        if genotype == '6':
            # Share the same Reference object between 6 and 6E.
            hcv_references[('6E', region)] = reference
    return hcv_references
Beispiel #6
0
def main():
    """ Write simulated FASTQ files built from sections of a reference.

    Every section of a file is located with find_coord_pos(), copied from
    the matching reference, changed by the file's codon mutations that fall
    inside the section, and then written section.count times. Reversed
    files hold the reverse complement and are labelled as read 2.
    """
    fastq_files = [
        FastqFile('2130A-HCV_S15_L001_R1_001.fastq', '2130', False,
                  (FastqSection('HCV2-JFH-1-NS5b', 1, 60, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 117, 176, 100)),
                  (CodonMutation(159, 'GTC'), )),
        FastqFile('2130A-HCV_S15_L001_R2_001.fastq', '2130', True,
                  (FastqSection('HCV2-JFH-1-NS5b', 57, 116, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 171, 230, 100)),
                  (CodonMutation(159, 'GTC'), )),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R1_001.fastq', '2130', False,
                  (FastqSection('HCV2-JFH-1-NS5b', 231, 313, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 396, 478, 100)),
                  (CodonMutation(316, 'AGC'), )),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R2_001.fastq', '2130', True,
                  (FastqSection('HCV2-JFH-1-NS5b', 313, 395, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 479, 561, 100)),
                  (CodonMutation(316, 'AGC'), ))
    ]
    header_template = '@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1\n'
    projects = ProjectConfig.loadDefault()
    for fastq_file in fastq_files:
        file_num = '2' if fastq_file.is_reversed else '1'
        with open(fastq_file.name, 'w') as fastq:
            # Cluster numbers keep counting up across a file's sections.
            next_cluster = 1
            for section in fastq_file.sections:
                ref_name, ref_start, ref_end = find_coord_pos(
                    projects, section.coord_name, section.start_pos,
                    section.end_pos)

                nucs = list(projects.getReference(ref_name)[ref_start:ref_end])
                for mutation in fastq_file.mutations:
                    if not (section.start_pos <= mutation.pos <=
                            section.end_pos):
                        continue
                    # Three nucleotides for each position in the section.
                    codon_start = 3 * (mutation.pos - section.start_pos)
                    nucs[codon_start:codon_start + 3] = list(mutation.codon)
                read_seq = ''.join(nucs)
                if fastq_file.is_reversed:
                    read_seq = reverse_and_complement(read_seq)
                quality = 'A' * (ref_end - ref_start)

                # Everything after the header is the same for each copy.
                record_body = read_seq + '\n' + '+\n' + quality + '\n'
                end_cluster = next_cluster + section.count
                for cluster_num in range(next_cluster, end_cluster):
                    fastq.write(header_template.format(fastq_file.extract_num,
                                                       cluster_num,
                                                       file_num))
                    fastq.write(record_body)
                next_cluster = end_cluster
Beispiel #7
0
def read_contigs(contigs_csv, excluded_seeds=None):
    """ Read contig sequences from a CSV file, merging them by group if asked.

    Each row must have the columns contig, match, ref and group_ref. When
    ARE_CONTIGS_MERGED is set, every row with a match fraction of at least
    0.25 is aligned with the sequence kept for its group_ref (which starts
    out as that reference) and spliced into it. All other rows are returned
    as separate contigs.

    :param contigs_csv: open CSV file; it is used as a context manager, so a
        regular file is closed once the rows have been read
    :param excluded_seeds: passed on to get_contig_name()
    :return: {contig_name: sequence}
    """
    # Settings passed to align_it() for every alignment.
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    contig_groups = defaultdict(
        list)  # {group_ref_name: [seq, index, index...]}
    conseqs = {}
    projects = ProjectConfig.loadDefault()
    with contigs_csv:
        contigs_reader = DictReader(contigs_csv)
        # Rows are numbered from 1 in file order, but processed last to first.
        for i, row in reversed(list(enumerate(contigs_reader, 1))):
            contig_seq = row['contig']
            match_fraction = float(row['match'])
            is_match = 0.25 <= match_fraction
            is_reversed = match_fraction < 0
            if not (ARE_CONTIGS_MERGED and is_match):
                # Not merging this one, so record it under its own name.
                contig_name = get_contig_name(i, row['ref'], is_match,
                                              is_reversed, excluded_seeds)
                conseqs[contig_name] = contig_seq
                continue
            group_ref_name = row['group_ref']
            contig_group = contig_groups[group_ref_name]
            if not contig_group:
                # First contig for this group: start from the group reference.
                contig_group.append(projects.getReference(group_ref_name))
            contig_group.append(str(i))
            group_seq = contig_group[0]
            # NOTE(review): presumably agroup and acontig are the two aligned
            # sequences padded with dashes - confirm against align_it().
            agroup, acontig, score = align_it(group_seq, contig_seq,
                                              gap_open_penalty,
                                              gap_extend_penalty,
                                              use_terminal_gap_penalty)
            # Group 1 runs from the first to the last non-dash character of
            # the aligned contig.
            match = re.match('-*([^-](.*[^-])?)', acontig)
            start = match.start(1)
            end = match.end(1)
            # Replace that span of the aligned group sequence with the
            # original (unaligned) contig sequence.
            merged_seq = agroup[:start] + contig_seq + agroup[end:]
            # Count the dashes at each end of the aligned group sequence, and
            # drop that many characters from each end of the merged sequence.
            left_trim = len(agroup) - len(agroup.lstrip('-'))
            right_trim = len(agroup) - len(agroup.rstrip('-'))
            # "or None" keeps the whole tail when there is nothing to trim.
            contig_group[0] = merged_seq[left_trim:-right_trim or None]

    # Merged groups are always named as forward matches.
    is_match = True
    is_reversed = False
    for group_ref_name, contig_group in contig_groups.items():
        (group_seq, *contig_nums) = contig_group
        # Contig numbers were collected last to first, so reverse them back
        # into file order for the name prefix.
        prefix = '_'.join(reversed(contig_nums))
        contig_name = get_contig_name(prefix, group_ref_name, is_match,
                                      is_reversed, excluded_seeds)
        conseqs[contig_name] = group_seq
    return conseqs
Beispiel #8
0
def fastq_g2p(pssm,
              fastq1,
              fastq2,
              g2p_csv,
              g2p_summary_csv=None,
              unmapped1=None,
              unmapped2=None,
              aligned_csv=None,
              min_count=1,
              min_valid=1,
              min_valid_percent=0.0,
              merged_contigs_csv=None):
    """ Run the G2P steps on a pair of FASTQ files and write the results.

    The reads are merged, collected by a ConsensusBuilder, trimmed to the V3
    loop reference, filtered by write_unmapped_reads(), and counted before
    write_rows() produces the output.

    :param pssm: passed to write_rows()
    :param fastq1: first FASTQ source, passed to FastqReader
    :param fastq2: second FASTQ source, passed to FastqReader
    :param g2p_csv: passed to write_rows(); when it has a name attribute,
        read counts use the prefix 'read_counts' in the same folder
    :param g2p_summary_csv: passed to write_rows()
    :param unmapped1: passed to write_unmapped_reads()
    :param unmapped2: passed to write_unmapped_reads()
    :param aligned_csv: when not None, aligned reads are written to it
    :param min_count: passed to write_rows()
    :param min_valid: passed to write_rows()
    :param min_valid_percent: passed to write_rows()
    :param merged_contigs_csv: when not None, receives a 'contig' column
        with each consensus that holds more than N's and dashes
    """
    g2p_filename = getattr(g2p_csv, 'name', None)
    count_prefix = (None if g2p_filename is None else
                    os.path.join(os.path.dirname(g2p_filename), 'read_counts'))

    projects = ProjectConfig.loadDefault()
    hiv_seed = projects.getReference(G2P_SEED_NAME)
    v3loop_ref = extract_target(hiv_seed,
                                projects.getReference(COORDINATE_REF_NAME))

    # Each step wraps the reads produced by the previous one.
    reads = merge_reads(FastqReader(fastq1, fastq2))
    consensus_builder = ConsensusBuilder()
    reads = consensus_builder.build(reads)
    reads = trim_reads(reads, v3loop_ref)
    reads = write_unmapped_reads(reads, unmapped1, unmapped2)
    read_counts = count_reads(reads, count_prefix)
    if aligned_csv is not None:
        read_counts = write_aligned_reads(read_counts,
                                          aligned_csv,
                                          hiv_seed,
                                          v3loop_ref)

    write_rows(pssm,
               read_counts,
               g2p_csv,
               g2p_summary_csv,
               min_count,
               min_valid=min_valid,
               min_valid_percent=min_valid_percent)

    if merged_contigs_csv is None:
        return
    contig_writer = DictWriter(merged_contigs_csv, ['contig'])
    contig_writer.writeheader()
    for consensus in consensus_builder.get_consensus_by_lengths():
        # Skip any consensus that is nothing but N's and dashes.
        if any(base not in 'N-' for base in consensus):
            contig_writer.writerow({'contig': consensus})
Beispiel #9
0
def main():
    """ Run all the reference checks, and print the total error count.

    Most checks are given the set of reference names that are still
    unchecked (presumably each check removes the names it covers - confirm
    in the check functions). Any names left in the set at the end are
    reported and added to the error count.
    """
    project_config = ProjectConfig.loadDefault()
    unchecked = set(project_config.getAllReferences().keys())
    error_count = (check_hcv_seeds(project_config, unchecked) +
                   check_hcv_coordinates(project_config, unchecked) +
                   check_hiv_seeds(project_config, unchecked) +
                   check_hiv_coordinates(project_config, unchecked) +
                   check_hiv_wild_types(project_config) +
                   check_hla_seeds(project_config, unchecked) +
                   check_hla_coordinates(project_config, unchecked))

    if unchecked:
        leftover_names = ", ".join(sorted(unchecked))
        print(fill_report(f'Unchecked refs: {leftover_names}'))
        error_count += len(unchecked)
    else:
        print('No unchecked refs.')
    print(f'Total errors: {error_count}.')
Beispiel #10
0
def test_duplicated_sars_base_amino(sequence_report):
    """ Check the amino counts around the duplicated base in SARS orf1ab.

    The selected report rows should spell the amino sequence AQSFLNRVCG.
    """

    # refname,qcut,rank,count,offset,seq
    aligned_reads = prepare_reads("""\
SARS-CoV-2-seed,15,0,9,0,GCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACAC
""")
    # Repeat is here:                     ^

    #                                       A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,...,coverage
    expected_text = """\
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,1,4396,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,4,4397,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,7,4398,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,10,4399,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,13,4400,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,16,4401,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,18,4402,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,21,4403,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,24,4404,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,27,4405,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9"""
    projects = ProjectConfig.loadDefault()
    sequence_report.projects = projects
    # One row for each position of both regions, plus the header row.
    expected_size = (len(projects.getReference('SARS-CoV-2-ORF1ab')) +
                     len(projects.getReference('SARS-CoV-2-nsp12')) + 1)

    report_file = StringIO()
    sequence_report.write_amino_header(report_file)
    sequence_report.read(aligned_reads)
    sequence_report.write_amino_counts()

    report = report_file.getvalue()
    report_lines = report.splitlines()
    if len(report_lines) != expected_size:
        # Comparing with an empty report makes a failure display the report.
        assert (len(report_lines), report) == (expected_size, '')

    assert '\n'.join(report_lines[4396:4406]) == expected_text
Beispiel #11
0
def test_duplicated_sars_base_nuc(sequence_report):
    """ The duplicated base in SARS must appear only once in nuc.csv. """

    # refname,qcut,rank,count,offset,seq
    aligned_reads = prepare_reads("""\
SARS-CoV-2-seed,15,0,9,10,ACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCG
""")

    #                  A,C,G,T,N,...,coverage
    expected_section = """\
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,21,13198,0,0,0,9,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,22,13199,0,0,0,9,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,23,13200,9,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,24,13201,9,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,25,13202,9,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,26,13203,0,9,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,27,13204,0,0,9,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,28,13205,0,0,9,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,29,13206,0,0,9,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,30,13207,0,0,0,9,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,31,13208,0,0,0,9,0,0,0,0,0,9"""
    projects = ProjectConfig.loadDefault()
    sequence_report.projects = projects
    amino_count = (len(projects.getReference('SARS-CoV-2-ORF1ab')) +
                   len(projects.getReference('SARS-CoV-2-nsp12')))
    header_size = 1
    skipped_rows = 2
    # Three nucleotide rows for each amino acid position.
    expected_size = amino_count * 3 + header_size - skipped_rows

    report_file = StringIO()
    sequence_report.write_nuc_header(report_file)
    sequence_report.read(aligned_reads)
    sequence_report.write_nuc_counts()

    report = report_file.getvalue()
    report_lines = report.splitlines()
    if len(report_lines) != expected_size:
        # Comparing with an empty report makes a failure display the report.
        assert (len(report_lines), report) == (expected_size, '')

    assert '\n'.join(report_lines[13198:13209]) == expected_section
def main():
    """ Compare the configured HIV references with fetched alignments.

    The consensus of consensuses from the 2004 ENV alignment is added to the
    2015 sequences under the name 'Consensus', then compare_config() lines
    are printed, followed by the reference names left in ref_names.
    """
    # find_best_match_for_pssm()
    consensus_sequences = fetch_alignment_sequences(
        2004,
        'CON',  # Consensus/Ancestral
        'ENV')
    # Strip the alignment gaps before using the consensus.
    consensus = consensus_sequences['CON_OF_CONS'].replace('-', '').upper()

    project_config = ProjectConfig.loadDefault()
    ref_names = set(project_config.getAllReferences().keys())

    consensus_accession = 'Consensus'
    new_sequences = fetch_alignment_sequences('2015', 'COM')
    assert consensus_accession not in new_sequences, sorted(
        new_sequences.keys())
    new_sequences[consensus_accession] = consensus

    report_lines = compare_config('HIV',
                                  project_config,
                                  new_sequences,
                                  ref_names)
    for report_line in report_lines:
        print(report_line, end='')

    unchecked_names = ', '.join(sorted(ref_names))
    print('Unchecked refs: ' + unchecked_names)
Beispiel #13
0
    def __init__(self,
                 file=None,
                 rules_yaml=None,
                 genotype=None,
                 references=None,
                 backup_genotype=None):
        """ Create empty rule tables, then load ASI rules if a source is given.

        :param file: rules source handed to load_xml(); used in preference to
            rules_yaml when both are supplied
        :param rules_yaml: rules handed to load_yaml() when file is None
        :param genotype: forwarded to load_yaml() along with rules_yaml
        :param references: {name: reference} mapping for the standards; when
            None, the default project references are loaded and extended
            with the wild types read from WILD_TYPES_PATH
        :param backup_genotype: forwarded to load_yaml() along with
            rules_yaml and genotype
        """
        if references is None:
            references = ProjectConfig.loadDefault().getAllReferences()
            with WILD_TYPES_PATH.open() as wild_types_file:
                references.update(safe_load(wild_types_file))

        # Same entries as references, except that 'INT' is stored as 'IN'.
        self.stds = {('IN' if ref_name == 'INT' else ref_name): reference
                     for ref_name, reference in references.items()}

        # Algorithm info
        self.alg_name = ''
        self.alg_version = ''

        # Definitions
        # {code: [drug_class_code]}
        self.gene_def = {}
        # {'1': 'Susceptible'}
        self.level_def = {}
        # {code: [drug_code]}
        self.drug_class = defaultdict(list)
        # [['-INF', '9', '1'], ...] - the first two items are the range, and
        # the third one is the resistance level.
        self.global_range = []
        # {code: comment_text}
        self.comment_def = {}

        # {code: (name, [condition, [(action_type, action_value)]])}
        self.drugs = {}
        # Not really used at the moment.
        self.mutation_comments = []

        # An XML file wins over YAML rules when both are supplied.
        if file is not None:
            self.load_xml(file)
            return
        if rules_yaml is not None:
            self.load_yaml(rules_yaml, genotype, backup_genotype)
Beispiel #14
0
def write_nuc_mutations(nuc_csv: typing.TextIO,
                        nuc_mutations_csv: typing.TextIO):
    """ Report the nucleotide variants found in SARS-CoV-2 nuc counts.

    Only rows for the seed 'SARS-CoV-2-seed' are used. For every position
    with coverage, each of A, C, G and T that differs from the reference
    nucleotide and has a prevalence of at least 0.05 gets one output row.

    :param nuc_csv: open CSV file with nucleotide counts; it needs the
        columns seed, region, refseq.nuc.pos, A, C, G, T and coverage
    :param nuc_mutations_csv: open file to write the mutations to, with the
        columns seed, region, wt, refseq_nuc_pos, var and prevalence
    """
    nuc_rows = DictReader(nuc_csv)
    column_names = ['seed',
                    'region',
                    'wt',
                    'refseq_nuc_pos',
                    'var',
                    'prevalence']
    writer = DictWriter(nuc_mutations_csv,
                        column_names,
                        lineterminator=os.linesep)
    writer.writeheader()
    for seed, seed_rows in groupby(nuc_rows, itemgetter('seed')):
        if seed != 'SARS-CoV-2-seed':
            continue
        landmark_reader = LandmarkReader.load()
        projects = ProjectConfig.loadDefault()
        rows_by_region = groupby(seed_rows, itemgetter('region'))
        for region_name, region_rows in rows_by_region:
            region = landmark_reader.get_gene(seed, region_name)
            seed_seq = projects.getReference(seed)
            # Region landmarks are one-based and include the end position.
            ref_seq = seed_seq[region['start']-1:region['end']]
            for row in region_rows:
                nuc_pos = int(row['refseq.nuc.pos'])
                wild_type = ref_seq[nuc_pos-1]
                coverage = int(row['coverage'])
                if coverage == 0:
                    # Nothing to divide by, so there is no prevalence.
                    continue
                variants = (nuc for nuc in 'ACGT' if nuc != wild_type)
                for variant in variants:
                    prevalence = int(row[variant]) / coverage
                    if prevalence < 0.05:
                        continue
                    writer.writerow({'seed': seed,
                                     'region': region_name,
                                     'wt': wild_type,
                                     'refseq_nuc_pos': nuc_pos,
                                     'var': variant,
                                     'prevalence': prevalence})
Beispiel #15
0
def fastq_g2p(pssm,
              fastq1,
              fastq2,
              g2p_csv,
              g2p_summary_csv=None,
              unmapped1=None,
              unmapped2=None,
              aligned_csv=None,
              min_count=1,
              min_valid=1,
              min_valid_percent=0.0):
    """ Run the G2P steps on a pair of FASTQ files and write the results.

    The reads are merged, trimmed to the V3 loop reference, filtered by
    write_unmapped_reads(), and counted before write_rows() produces the
    output.

    :param pssm: passed to write_rows()
    :param fastq1: first FASTQ source, passed to FastqReader
    :param fastq2: second FASTQ source, passed to FastqReader
    :param g2p_csv: passed to write_rows(); when it has a name attribute,
        read counts use the prefix 'read_counts' in the same folder
    :param g2p_summary_csv: passed to write_rows()
    :param unmapped1: passed to write_unmapped_reads()
    :param unmapped2: passed to write_unmapped_reads()
    :param aligned_csv: when not None, aligned reads are written to it
    :param min_count: passed to write_rows()
    :param min_valid: passed to write_rows()
    :param min_valid_percent: passed to write_rows()
    """
    g2p_filename = getattr(g2p_csv, 'name', None)
    count_prefix = (None if g2p_filename is None else
                    os.path.join(os.path.dirname(g2p_filename), 'read_counts'))

    projects = ProjectConfig.loadDefault()
    hiv_seed = projects.getReference(G2P_SEED_NAME)
    v3loop_ref = extract_target(hiv_seed,
                                projects.getReference(COORDINATE_REF_NAME))

    # Each step wraps the reads produced by the previous one.
    reads = merge_reads(FastqReader(fastq1, fastq2))
    reads = trim_reads(reads, v3loop_ref)
    reads = write_unmapped_reads(reads, unmapped1, unmapped2)
    read_counts = count_reads(reads, count_prefix)
    if aligned_csv is not None:
        read_counts = write_aligned_reads(read_counts, aligned_csv, hiv_seed,
                                          v3loop_ref)

    write_rows(pssm,
               read_counts,
               g2p_csv,
               g2p_summary_csv,
               min_count,
               min_valid=min_valid,
               min_valid_percent=min_valid_percent)
Beispiel #16
0
def main():
    """ Run all the reference checks, and print the total error count.

    Most checks are given the set of reference names that are still
    unchecked (presumably each check removes the names it covers - confirm
    in the check functions). Any names left in the set at the end are
    reported and added to the error count.
    """
    project_config = ProjectConfig.loadDefault()
    unchecked = set(project_config.getAllReferences().keys())
    error_count = (check_hcv_seeds(project_config, unchecked) +
                   check_hcv_coordinates(project_config, unchecked) +
                   check_hiv_seeds(project_config, unchecked) +
                   check_hiv_coordinates(project_config, unchecked) +
                   check_hiv_wild_types(project_config) +
                   check_hla_seeds(project_config, unchecked) +
                   check_hla_coordinates(project_config, unchecked) +
                   check_sars_seeds(project_config, unchecked) +
                   check_sars_coordinates(project_config, unchecked))

    if unchecked:
        leftover_names = ", ".join(sorted(unchecked))
        print(fill_report(f'Unchecked refs: {leftover_names}'))
        error_count += len(unchecked)
    else:
        print('No unchecked refs.')
    print(f'Total errors: {error_count}.')
def load_hcv(seqs):
    """ Add the HCV primers to seqs, and report any that differ from H77.

    Forward primers (direction F) are appended to seqs['left'], and all the
    others to seqs['right']. Each forward primer, or the reverse complement
    of any other primer, is compared with the listed position range of the
    'HCV-1a' reference, and every mismatch is printed as a diff.

    :param seqs: dict with 'left' and 'right' lists to append SeqRecord
        objects to
    """
    primer_table = """\
protocol,name,direction,length,h77_pos,sequence
HCV WG,oligo dA20,R,20,9418-9437,AAAAAAAAAAAAAAAAAAAA
,Pr3,R,30,8616-8645,GGCGGAATTCCTGGTCATAGCCTCCGTGAA
,1abGENF1bp,F,28,266-293,GGGTCGCGAAAGGCCTTGTGGTACTGCC
,TIM-Pr3,R,30,8616-8645,CAGGAAACAGCTATGACGGCGGAATTCCTGGTCATAGCCTCCGTGAA
,1abGENF2,F,30,286-315,GTACTGCCTGATAGGGTGCTTGCGAGTGCC
,Pr6,R,30,8611-8640,AATTCCTGGTCATAGCCTCCGTGAAGACTC
HCV miDi,Pr1,F,31,8245-8275,TGGGGTTCGCGTATGATACCCGCTGCTTTGA
,Pr2,F,31,8245-8275,TGGGGTTTTCTTACGACACCAGGTGCTTTGA
,oligo dA20-TIM,R,20,9418-9437,CAGGAAACAGCTATGACAAAAAAAAAAAAAAAAAAAA
,Pr4,F,29,8253-8281,CCGTATGATACCCGCTGCTTTGACTCAAC
,Pr5,F,29,8253-8281,TCCTACGACACCAGGTGCTTTGATTCAAC
,TIM,R,,1-0,CAGGAAACAGCTATGAC
"""
    h77 = ProjectConfig.loadDefault().getReference('HCV-1a')
    is_comparing = True
    differ = Differ()
    for row in DictReader(StringIO(primer_table)):
        name = 'HCV ' + row['name']
        start, end = map(int, row['h77_pos'].split('-'))
        primer = SeqRecord(Seq(row['sequence']), name, description='')
        complement = primer.reverse_complement(id=primer.id, description='')
        if row['direction'] == 'F':
            seqs['left'].append(primer)
            expected = primer
        else:
            # The primer is stored as listed, but its reverse complement is
            # what gets compared with the reference.
            seqs['right'].append(primer)
            expected = complement
        # Positions in the table are one-based and include the end.
        h77_section = Seq(h77[start - 1:end])
        if is_comparing and expected.seq != h77_section:
            print(name, 'does not match.')
            diffs = differ.compare([str(expected.seq) + '\n'],
                                   [str(h77_section) + '\n'])
            print(*diffs, sep='')
Beispiel #18
0
def fastq_g2p(pssm,
              fastq1,
              fastq2,
              g2p_csv,
              g2p_summary_csv=None,
              unmapped1=None,
              unmapped2=None,
              aligned_csv=None,
              min_count=1,
              min_valid=1,
              min_valid_percent=0.0):
    """ Run the G2P steps on a pair of FASTQ files and write the results.

    The reads are merged, trimmed to the V3 loop reference, filtered by
    write_unmapped_reads(), and counted before write_rows() produces the
    output.

    :param pssm: passed to write_rows()
    :param fastq1: first FASTQ source, passed to FastqReader
    :param fastq2: second FASTQ source, passed to FastqReader
    :param g2p_csv: passed to write_rows(); when it has a name attribute,
        read counts use the prefix 'read_counts' in the same folder
    :param g2p_summary_csv: passed to write_rows()
    :param unmapped1: passed to write_unmapped_reads()
    :param unmapped2: passed to write_unmapped_reads()
    :param aligned_csv: when not None, aligned reads are written to it
    :param min_count: passed to write_rows()
    :param min_valid: passed to write_rows()
    :param min_valid_percent: passed to write_rows()
    """
    g2p_filename = getattr(g2p_csv, 'name', None)
    count_prefix = (None if g2p_filename is None else
                    os.path.join(os.path.dirname(g2p_filename), 'read_counts'))

    projects = ProjectConfig.loadDefault()
    hiv_seed = projects.getReference(G2P_SEED_NAME)
    v3loop_ref = extract_target(hiv_seed,
                                projects.getReference(COORDINATE_REF_NAME))

    # Each step wraps the reads produced by the previous one.
    reads = merge_reads(FastqReader(fastq1, fastq2))
    reads = trim_reads(reads, v3loop_ref)
    reads = write_unmapped_reads(reads, unmapped1, unmapped2)
    read_counts = count_reads(reads, count_prefix)
    if aligned_csv is not None:
        read_counts = write_aligned_reads(read_counts,
                                          aligned_csv,
                                          hiv_seed,
                                          v3loop_ref)

    write_rows(pssm,
               read_counts,
               g2p_csv,
               g2p_summary_csv,
               min_count,
               min_valid=min_valid,
               min_valid_percent=min_valid_percent)
Beispiel #19
0
def main():
    """ Register a new pipeline version in QAI, with its project settings.

    Regions and projects from the default project configuration are created
    in QAI when they are missing. Then a project version, its project
    regions, and their key positions are posted for the new pipeline, using
    the coverage settings from project_scoring.json.

    :raises RuntimeError: if QAI already has the requested pipeline version
    """
    args = parse_args()
    project_config = ProjectConfig.loadDefault()
    # Scoring settings live in the parent of this file's folder.
    scoring_path = Path(__file__).parent.parent / 'project_scoring.json'
    with scoring_path.open() as scoring_file:
        scoring_config = json.load(scoring_file)
    with qai_helper.Session() as session:
        session.login(args.qai_server, args.qai_user, args.qai_password)

        # Refuse to continue when this pipeline version is already there.
        pipelines = session.get_json("/lab_miseq_pipelines?version=" +
                                     args.pipeline_version,
                                     retries=0)
        if pipelines:
            raise RuntimeError('Pipeline {} already exists.'.format(
                args.pipeline_version))

        seed_groups = session.get_json("/lab_miseq_seed_groups")
        # {seed_group_name: seed_group_id}
        # noinspection PyTypeChecker
        seed_group_ids = dict(map(itemgetter('name', 'id'), seed_groups))
        old_regions = session.get_json("/lab_miseq_regions", retries=0)
        # {region_name: region}
        regions = dict(((region['name'], region) for region in old_regions))
        for region_name, region_data in project_config.config['regions'].items(
        ):
            # The configured reference is a list of strings to join together.
            ref_seq = ''.join(region_data['reference'])
            region = regions.get(region_name)
            if region is None:
                # New region: create its seed group first, if it names one
                # that QAI doesn't have yet.
                seed_group_name = region_data['seed_group']
                seed_group_id = seed_group_ids.get(seed_group_name)
                if seed_group_id is None and seed_group_name:
                    seed_group = session.post_json("/lab_miseq_seed_groups",
                                                   {'name': seed_group_name})
                    seed_group_id = seed_group['id']
                    seed_group_ids[seed_group_name] = seed_group_id
                region = session.post_json(
                    "/lab_miseq_regions", {
                        'name': region_name,
                        'is_nucleotide': region_data['is_nucleotide'],
                        'reference': ref_seq,
                        'seed_group_id': seed_group_id
                    })
                regions[region_name] = region
            elif region['reference'] != ref_seq:
                # Existing region with a different sequence: always report
                # it, but only change QAI when update_sequences was requested.
                print("Reference doesn't match:", region_name)
                if args.update_sequences:
                    region['reference'] = ref_seq
                    session.post_json(f"/lab_miseq_regions/{region['id']}",
                                      region)

        pipeline = session.post_json("/lab_miseq_pipelines",
                                     {'version': args.pipeline_version})
        pipeline_id = pipeline['id']

        old_projects = session.get_json("/lab_miseq_projects", retries=0)
        # {project_name: project}
        projects = dict(
            ((project['name'], project) for project in old_projects))
        for project_name, project_data in project_config.config[
                'projects'].items():
            project = projects.get(project_name)
            if project is None:
                project = session.post_json(
                    "/lab_miseq_projects", {
                        'name': project_name,
                        'max_variants': project_data['max_variants']
                    })
            # Every configured project gets a version for the new pipeline.
            project_version = session.post_json("/lab_miseq_project_versions",
                                                {
                                                    'pipeline_id': pipeline_id,
                                                    'project_id': project['id']
                                                })
            for i, region_data in enumerate(project_data['regions']):
                # Scoring entries are matched to project regions by position.
                scoring_data = scoring_config['projects'][project_name][
                    'regions'][i]
                coordinate_region = regions[region_data['coordinate_region']]
                # The seed group is taken from the first seed region listed.
                seed_region = regions[region_data['seed_region_names'][0]]
                seed_group_id = seed_region['seed_group_id']
                project_region = session.post_json(
                    "/lab_miseq_project_regions", {
                        'project_version_id': project_version['id'],
                        'coordinate_region_id': coordinate_region['id'],
                        'min_coverage1': scoring_data['min_coverage1'],
                        'min_coverage2': scoring_data['min_coverage2'],
                        'min_coverage3': scoring_data['min_coverage3'],
                        'seed_group_id': seed_group_id
                    })

                for key_position in scoring_data['key_positions']:
                    session.post_json(
                        "/lab_miseq_key_positions", {
                            'project_region_id': project_region['id'],
                            'start_pos': key_position['start_pos'],
                            'end_pos': key_position['end_pos']
                        })

    print("Done.")
Beispiel #20
0
def build_conseqs(conseqs_file, run, sample_sheet, ok_sample_regions):
    """
    Turn a pipeline conseq file into the JSON-ready records sent to QAI.

    @param conseqs_file: An open CSV file of consensus sequences; the columns
        read here are sample, region, q-cutoff, consensus-percent-cutoff and
        sequence.
    @param run: a hash with the attributes of the run record; each entry of
        its 'sequencing_summary' supplies a 'tag' and a 'target_project'.
    @param sample_sheet: The data parsed from the sample sheet. Its "Data"
        entry is looked up with each row's "sample" value.
    @param ok_sample_regions: A set of (sample_name, region, qcut) tuples that
        were given a good score by the pipeline.
    @return a list of hashes, one for each row of conseqs_file.
    """
    sequencing_summary = run['sequencing_summary']
    reader = csv.DictReader(conseqs_file)
    projects = ProjectConfig.loadDefault()

    # {(tag, seed_name)} for every seed of every sequencing's target project.
    target_regions = {
        (sequencing['tag'], seed_name)
        for sequencing in sequencing_summary
        for seed_name in projects.getProjectSeeds(
            sequencing['target_project'])}

    conseqs = []
    for row in reader:
        # The "sample" column is the key into the sample sheet's data, which
        # holds the original sample name and the tags for that sample.
        sample_key = row["sample"]
        sheet_entry = sample_sheet["Data"][sample_key]
        sample_name = sheet_entry["orig_sample_name"]
        tags = sheet_entry["tags"]

        # FIXME: a blank sequence is replaced with a dash until the
        # receiving side can accept a blank value.
        sequence = row["sequence"]
        if len(sequence) == 0:
            sequence = "-"

        region = row["region"]
        scored_ok = (sample_key, region, row["q-cutoff"]) in ok_sample_regions
        was_targeted = (tags, region) in target_regions
        conseqs.append({
            "samplename": sample_name,
            # The test code can't be filled in without a lookup table that
            # has not been defined yet (noted July 9, 2014).
            "testcode": None,
            "conseq_cutoff": row["consensus-percent-cutoff"],
            "region": region,
            "qcutoff": float(row["q-cutoff"]),
            "snum": sample_key.split('_')[-1],
            "seq": sequence,
            "ok_for_release": scored_ok and was_targeted,
        })
    return conseqs
Beispiel #21
0
def aln2counts(aligned_csv,
               nuc_csv,
               amino_csv,
               coord_ins_csv,
               conseq_csv,
               failed_align_csv,
               callback=None,
               coverage_summary_csv=None,
               clipping_csv=None,
               conseq_ins_csv=None,
               g2p_aligned_csv=None,
               remap_conseq_csv=None):
    """
    Count nucleotides and amino acids in aligned reads, and report consensus.

    All of the work is delegated to a SequenceReport; this function wires
    the input and output files to it.

    @param aligned_csv: Open file handle containing aligned reads (from
        sam2aln)
    @param nuc_csv: Open file handle to write nucleotide frequencies.
    @param amino_csv: Open file handle to write amino acid frequencies.
    @param coord_ins_csv: Open file handle to write insertions relative to
        coordinate reference.
    @param conseq_csv: Open file handle to write consensus sequences.
    @param failed_align_csv: Open file handle to write sample consensus
        sequences that failed to align to the coordinate reference.
    @param callback: a function to report progress with three optional
        parameters - callback(message, progress, max_progress). It is only
        enabled when aligned_csv has a non-empty name attribute, because the
        file size on disk is passed along with it.
    @param coverage_summary_csv: Open file handle to write coverage depth.
    @param clipping_csv: Open file handle containing soft clipping counts
    @param conseq_ins_csv: Open file handle containing insertions relative to
        consensus sequence
    @param g2p_aligned_csv: Open file handle containing aligned reads (from
        fastq_g2p)
    @param remap_conseq_csv: Open file handle containing consensus sequences
        from the remap step.
    """
    # Set up the report with the default project configuration.
    projects = ProjectConfig.loadDefault()
    insert_writer = InsertionWriter(coord_ins_csv)
    report = SequenceReport(insert_writer, projects, CONSEQ_MIXTURE_CUTOFFS)
    report.consensus_min_coverage = CONSENSUS_MIN_COVERAGE

    # Headers for the four main output files.
    report.write_amino_header(amino_csv)
    report.write_consensus_header(conseq_csv)
    report.write_failure_header(failed_align_csv)
    report.write_nuc_header(nuc_csv)

    # The coverage summary is optional: without a file, nothing is collected.
    coverage_writer = coverage_summary = None
    if coverage_summary_csv is not None:
        coverage_writer = csv.DictWriter(
            coverage_summary_csv,
            ['avg_coverage', 'coverage_region', 'region_width'],
            lineterminator=os.linesep)
        coverage_writer.writeheader()
        coverage_summary = {}

    # Progress reporting needs the size of the aligned reads file on disk.
    aligned_filename = getattr(aligned_csv, 'name', None) if callback else None
    if aligned_filename:
        report.enable_callback(callback, os.stat(aligned_filename).st_size)

    # Feed each optional input to the report, in a fixed order.
    optional_inputs = ((clipping_csv, report.read_clipping),
                       (conseq_ins_csv, report.read_insertions),
                       (remap_conseq_csv, report.read_remap_conseqs))
    for source_csv, load in optional_inputs:
        if source_csv is not None:
            load(source_csv)

    report.process_reads(g2p_aligned_csv, aligned_csv, coverage_summary)

    # coverage_summary stays None when no summary file was requested, so a
    # single truth test covers both "not requested" and "nothing collected".
    if coverage_summary:
        coverage_writer.writerow(coverage_summary)
Beispiel #22
0
def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None):
    """ Use Blastn to search for the genotype of a set of reference sequences.

    :param str fasta: file path of the FASTA file containing the query
        sequences
    :param str db: file path of the database to search for matches
    :param blast_csv: open file to write the blast matches to, or None
    :param dict group_refs: {contig_ref: group_ref} or None. The dictionary
        will get filled in with the mapping from each contig's reference name
        to the best matched reference for the whole seed group.
    :return: {query_name: (ref_name, matched_fraction)} where query_name is a
        sequence header from the query sequences FASTA file, ref_name is the
        name of the best match from the database, and matched_fraction is the
        fraction of the query that aligned against the reference (matches and
        mismatches). The fraction is negative when the match runs backward
        along the reference (send < sstart).
    """
    # Number the contigs in the order their headers appear in the FASTA file.
    contig_nums = {}  # {contig_name: contig_num}
    with open(fasta) as f:
        for line in f:
            if line.startswith('>'):
                # Strip only the line ending, so a header on a final line
                # without a newline keeps its last character.
                contig_name = line[1:].rstrip('\n')
                contig_nums[contig_name] = len(contig_nums) + 1
    blast_columns = [
        'qaccver', 'saccver', 'pident', 'score', 'qcovhsp', 'qstart', 'qend',
        'sstart', 'send'
    ]
    cline = NcbiblastnCommandline(query=fasta,
                                  db=db,
                                  outfmt=f'"10 {" ".join(blast_columns)}"',
                                  evalue=0.0001,
                                  gapopen=5,
                                  gapextend=2,
                                  penalty=-3,
                                  reward=1,
                                  max_target_seqs=5000)
    stdout, _ = cline()
    samples = {}  # {query_name: (subject_name, matched_fraction)}

    # Sort by query, then ascending score, so that the last row seen for each
    # query is its highest-scoring match.
    matches = sorted(DictReader(StringIO(stdout), blast_columns),
                     key=lambda row: (row['qaccver'], float(row['score'])))
    if not blast_csv:
        blast_writer = None
    else:
        blast_writer = DictWriter(blast_csv, [
            'contig_num', 'ref_name', 'score', 'match', 'pident', 'start',
            'end', 'ref_start', 'ref_end'
        ],
                                  lineterminator=os.linesep)
        blast_writer.writeheader()

    # Later (higher-scoring) rows overwrite earlier ones for the same query.
    contig_top_matches = {
        match['qaccver']: match['saccver']
        for match in matches
    }
    top_refs = set(contig_top_matches.values())
    projects = ProjectConfig.loadDefault()

    # Total the scores of each top reference, counting only matches in the
    # same seed group as the contig's own top reference.
    match_scores = Counter()
    for contig_name, contig_matches in groupby(matches, itemgetter('qaccver')):
        contig_top_ref = contig_top_matches[contig_name]
        contig_seed_group = projects.getSeedGroup(contig_top_ref)
        for match in contig_matches:
            ref_name = match['saccver']
            if ref_name not in top_refs:
                continue
            match_seed_group = projects.getSeedGroup(ref_name)
            if match_seed_group == contig_seed_group:
                match_scores[ref_name] += float(match['score'])

    if group_refs is not None:
        # Walk from the lowest total to the highest, so the highest-scoring
        # reference is the one left standing for each seed group.
        group_top_refs = {
            projects.getSeedGroup(ref_name): ref_name
            for ref_name, _ in reversed(match_scores.most_common())
        }
        for ref_name in contig_top_matches.values():
            group_refs[ref_name] = group_top_refs[projects.getSeedGroup(
                ref_name)]

    for match in matches:
        matched_fraction = float(match['qcovhsp']) / 100
        if int(match['send']) < int(match['sstart']):
            # Match runs backward along the reference: flag it with a
            # negative fraction.
            matched_fraction *= -1
        pident = round(float(match['pident']))
        contig_name = match['qaccver']
        # Rows are in ascending score order, so the best match is kept.
        samples[contig_name] = (match['saccver'], matched_fraction)
        if blast_writer:
            blast_writer.writerow(
                dict(contig_num=contig_nums[contig_name],
                     ref_name=match['saccver'],
                     score=match['score'],
                     match=matched_fraction,
                     pident=pident,
                     start=match['qstart'],
                     end=match['qend'],
                     ref_start=match['sstart'],
                     ref_end=match['send']))
    return samples
Beispiel #23
0
def build_conseqs(conseqs_file, run, sample_sheet, ok_sample_regions):
    """
    Read a pipeline conseq file and build the records to send to QAI.

    @param conseqs_file: An open CSV file of consensus sequences; the columns
        read here are sample, region, q-cutoff, consensus-percent-cutoff and
        sequence.
    @param run: a hash with the attributes of the run record; each entry of
        its 'sequencing_summary' supplies a 'tag' and a 'target_project'.
    @param sample_sheet: The data parsed from the sample sheet. Its "Data"
        entry is looked up with each row's "sample" value.
    @param ok_sample_regions: A set of (sample_name, region, qcut) tuples that
        were given a good score by the pipeline.
    @return a list of hashes, one for each row of conseqs_file.
    """
    sequencing_summary = run['sequencing_summary']
    reader = csv.DictReader(conseqs_file)
    projects = ProjectConfig.loadDefault()

    target_regions = set()  # {(tag, seed_name)}
    for sequencing in sequencing_summary:
        try:
            project_seeds = projects.getProjectSeeds(
                sequencing['target_project'])
        except KeyError:
            # A KeyError here is logged and the sequencing contributes no
            # target regions, instead of aborting the whole upload.
            logger.warning('Failed to load project seeds.', exc_info=True)
            project_seeds = set()
        target_regions.update(
            (sequencing['tag'], seed_name) for seed_name in project_seeds)

    conseqs = []
    for row in reader:
        # The "sample" column is the key into the sample sheet's data, which
        # holds the original sample name and the tags for that sample.
        sample_key = row["sample"]
        sheet_entry = sample_sheet["Data"][sample_key]
        sample_name = sheet_entry["orig_sample_name"]
        tags = sheet_entry["tags"]

        # FIXME: a blank sequence is replaced with a dash until the
        # receiving side can accept a blank value.
        sequence = row["sequence"]
        if len(sequence) == 0:
            sequence = "-"

        region = row["region"]
        scored_ok = (sample_key, region, row["q-cutoff"]) in ok_sample_regions
        was_targeted = (tags, region) in target_regions
        conseqs.append({
            "samplename": sample_name,
            # The test code can't be filled in without a lookup table that
            # has not been defined yet (noted July 9, 2014).
            "testcode": None,
            "conseq_cutoff": row["consensus-percent-cutoff"],
            "region": region,
            "qcutoff": float(row["q-cutoff"]),
            "snum": sample_key.split('_')[-1],
            "seq": sequence,
            "ok_for_release": scored_ok and was_targeted,
        })
    return conseqs
Beispiel #24
0
def main():
    """ Register a new pipeline version and its project settings in QAI.

    Reads the default project configuration and ../project_scoring.json,
    then posts to QAI: any seed groups and regions that don't exist yet, the
    new pipeline record, a project version for every configured project, and
    the project regions with their key positions.

    :raise RuntimeError: if QAI already has a pipeline with the version in
        settings.pipeline_version.
    """
    project_config = ProjectConfig.loadDefault()
    # Plain text mode: the 'U' flag is redundant, and Python 3.11 rejects it.
    with open('../project_scoring.json', 'r') as scoring_file:
        scoring_config = json.load(scoring_file)
    with qai_helper.Session() as session:
        session.login(settings.qai_path, settings.qai_user,
                      settings.qai_password)

        # Refuse to upload the same pipeline version twice.
        pipelines = session.get_json("/lab_miseq_pipelines?version=" +
                                     settings.pipeline_version,
                                     retries=0)
        if pipelines:
            raise RuntimeError('Pipeline {} already exists.'.format(
                settings.pipeline_version))

        # Create whichever regions (and their seed groups) QAI is missing.
        seed_groups = session.get_json("/lab_miseq_seed_groups")
        seed_group_ids = dict(map(itemgetter('name', 'id'), seed_groups))
        old_regions = session.get_json("/lab_miseq_regions", retries=0)
        regions = dict(((region['name'], region) for region in old_regions))
        for region_name, region_data in project_config.config[
                'regions'].items():
            region = regions.get(region_name)
            if region is None:
                seed_group_name = region_data['seed_group']
                seed_group_id = seed_group_ids.get(seed_group_name)
                if seed_group_id is None and seed_group_name:
                    seed_group = session.post_json("/lab_miseq_seed_groups",
                                                   {'name': seed_group_name})
                    seed_group_id = seed_group['id']
                    seed_group_ids[seed_group_name] = seed_group_id
                region = session.post_json(
                    "/lab_miseq_regions", {
                        'name': region_name,
                        'is_nucleotide': region_data['is_nucleotide'],
                        'reference': ''.join(region_data['reference']),
                        'seed_group_id': seed_group_id
                    })
                regions[region_name] = region

        pipeline = session.post_json("/lab_miseq_pipelines",
                                     {'version': settings.pipeline_version})
        pipeline_id = pipeline['id']

        # Every configured project gets a version tied to the new pipeline.
        old_projects = session.get_json("/lab_miseq_projects", retries=0)
        projects = dict(
            ((project['name'], project) for project in old_projects))
        for project_name, project_data in project_config.config[
                'projects'].items():
            project = projects.get(project_name)
            if project is None:
                project = session.post_json(
                    "/lab_miseq_projects", {
                        'name': project_name,
                        'max_variants': project_data['max_variants']
                    })
            project_version = session.post_json("/lab_miseq_project_versions",
                                                {
                                                    'pipeline_id': pipeline_id,
                                                    'project_id': project['id']
                                                })
            for i, region_data in enumerate(project_data['regions']):
                # Scoring settings are matched to project regions by index.
                scoring_data = scoring_config['projects'][project_name][
                    'regions'][i]
                coordinate_region = regions[region_data['coordinate_region']]
                # The seed group is taken from the first seed region listed.
                seed_region = regions[region_data['seed_region_names'][0]]
                seed_group_id = seed_region['seed_group_id']
                project_region = session.post_json(
                    "/lab_miseq_project_regions", {
                        'project_version_id': project_version['id'],
                        'coordinate_region_id': coordinate_region['id'],
                        'min_coverage1': scoring_data['min_coverage1'],
                        'min_coverage2': scoring_data['min_coverage2'],
                        'min_coverage3': scoring_data['min_coverage3'],
                        'seed_group_id': seed_group_id
                    })

                for key_position in scoring_data['key_positions']:
                    session.post_json(
                        "/lab_miseq_key_positions", {
                            'project_region_id': project_region['id'],
                            'start_pos': key_position['start_pos'],
                            'end_pos': key_position['end_pos']
                        })

    print("Done.")
Beispiel #25
0
def find_probes(contigs_csv, probes_csv):
    """ Search every contig for each target sequence and report the matches.

    One row is written per contig, with a group of columns for each entry in
    TARGET_SEQUENCES.

    :param contigs_csv: open CSV file to read. Columns used: sample; one of
        genotype, ref or region for the seed name; one of contig or sequence
        for the nucleotides; and optionally consensus-percent-cutoff, in
        which case only blank or 'MAX' rows are processed.
    :param probes_csv: open file to write the probe report to.
    """
    column_types = ('in_contig_start', 'in_contig_size', 'in_hxb2_start',
                    'in_hxb2_size', 'merged_hxb2_start', 'merged_hxb2_size',
                    'dist', 'end_dist', 'score', 'is_reversed', 'seq')
    reader = DictReader(contigs_csv)
    columns = ['sample', 'contig']
    columns.extend(target_name + '_' + column_type
                   for target_name in TARGET_SEQUENCES
                   for column_type in column_types)
    writer = DictWriter(probes_csv, columns)
    writer.writeheader()

    hxb2 = ProjectConfig.loadDefault().getReference('HIV1-B-FR-K03455-seed')
    # Alignment settings passed to align_it for every alignment below.
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1

    for sample_name, sample_rows in groupby(reader, itemgetter('sample')):
        # Contigs are numbered within each sample, counting only kept rows.
        contig_num = 0
        for row in sample_rows:
            seed_name = row.get('genotype') or row.get('ref') or row['region']
            cutoff = row.get('consensus-percent-cutoff')
            if cutoff and cutoff != 'MAX':
                continue
            contig_num += 1
            contig_seq: str = row.get('contig') or row['sequence']

            # Align the whole contig to HXB2 once, then reuse the alignment
            # for every target.
            hxb2_aligned, contig_aligned, _ = align_it(
                hxb2, contig_seq, gap_open_penalty, gap_extend_penalty,
                use_terminal_gap_penalty)
            report_row = dict(sample=sample_name,
                              contig=f'{contig_num}-{seed_name}')
            for target_name, target_seq in TARGET_SEQUENCES.items():
                finder = ProbeFinder(contig_seq, target_seq)

                # One-based, inclusive span of the match within the contig.
                size = len(finder.contig_match)
                start_pos = finder.start + 1
                end_pos = finder.start + size

                # Walk the contig alignment to translate that span into
                # HXB2 positions.
                hxb2_pos = contig_pos = 0
                merged_hxb2_start = merged_hxb2_size = None
                for hxb2_nuc, contig_nuc in zip(hxb2_aligned, contig_aligned):
                    if hxb2_nuc != '-':
                        hxb2_pos += 1
                    if contig_nuc == '-':
                        continue
                    contig_pos += 1
                    if contig_pos == start_pos:
                        merged_hxb2_start = hxb2_pos
                    if contig_pos == end_pos:
                        merged_hxb2_size = hxb2_pos - merged_hxb2_start + 1
                        break

                # Align just the matched piece to HXB2, and measure the part
                # of the reference between the leading and trailing gaps.
                aligned_ref, aligned_match, _ = align_it(
                    hxb2, finder.contig_match, gap_open_penalty,
                    gap_extend_penalty, use_terminal_gap_penalty)
                without_lead = aligned_match.lstrip('-')
                in_hxb2_start = len(aligned_match) - len(without_lead)
                tail_len = len(without_lead) - len(without_lead.rstrip('-'))
                ref_match = aligned_ref[in_hxb2_start:-tail_len or None]
                in_hxb2_size = len(ref_match) - ref_match.count('-')

                is_reversed = 'Y' if finder.is_reversed else 'N'
                prefix = target_name + '_'
                report_row.update({
                    prefix + 'in_contig_start': start_pos,
                    prefix + 'in_contig_size': size,
                    prefix + 'in_hxb2_start': in_hxb2_start,
                    prefix + 'in_hxb2_size': in_hxb2_size,
                    prefix + 'merged_hxb2_start': merged_hxb2_start,
                    prefix + 'merged_hxb2_size': merged_hxb2_size,
                    prefix + 'dist': finder.dist,
                    prefix + 'end_dist': finder.end_dist,
                    prefix + 'score': finder.score,
                    prefix + 'is_reversed': is_reversed,
                    prefix + 'seq': finder.contig_match,
                })
            writer.writerow(report_row)
Beispiel #26
0
from micall.utils.alignment_wrapper import align_nucs

try:
    # noinspection PyPackageRequirements
    from mappy import Aligner
except ImportError:
    Aligner = None

from micall.utils.fetch_sequences import fetch_by_accession


import sys
from micall.core.project_config import ProjectConfig

# The 'SARS-CoV-2-seed' reference from the default project configuration,
# loaded once when this module is imported.
REFERENCE = ProjectConfig.loadDefault().getReference('SARS-CoV-2-seed')

def load_coverage(csv):
    """ Read the coverage at each query position from a CSV file.

    :param csv: path of a CSV file with query_nuc_pos and coverage columns
        that hold integers; any other columns are ignored.
    :return: {query_nuc_pos: coverage}, both as ints. If a position is
        repeated, the last row wins.
    """
    with open(csv) as coverage_file:
        return {int(row['query_nuc_pos']): int(row['coverage'])
                for row in DictReader(coverage_file)}

BATCH = 'batch_01'

ROOT = (
    Path('/wow')
    / BATCH
Beispiel #27
0
def build_coverage_figure(genome_coverage_csv, blast_csv=None):
    """ Build a figure of contig coverage, grouped by coordinate reference.

    For each coordinates name found in the coverage file, the figure gets
    the landmark tracks listed for it in data/landmark_references.yaml (or
    whatever add_partial_banner adds when no landmark set matches), then a
    group of arrows for the blast matches of its contigs, then whatever
    build_contig adds for each contig.

    :param genome_coverage_csv: open file to read, with the columns
        query_nuc_pos, refseq_nuc_pos, coordinates, contig, coverage and
        dels. It must support seek(), because it is rewound and read again
        for every contig.
    :param blast_csv: open file with blast matches, or None. The columns
        start, end, ref_start and ref_end are converted to int, and ref_name
        is read here; the rows are also handed to ContigMatcher and
        build_contig.
    :return: the Figure. When nothing was added to it, it holds a single
        'No contigs found.' track.
    """
    # Default position range; the rows below can only widen it.
    min_position, max_position = 1, 500
    coordinate_depths = Counter()
    contig_depths = Counter()
    contig_groups = defaultdict(set)  # {coordinates_name: {contig_name}}
    reader = DictReader(genome_coverage_csv)
    for row in reader:
        query_nuc_pos = int(row['query_nuc_pos'])
        if row['refseq_nuc_pos']:
            refseq_nuc_pos = int(row['refseq_nuc_pos'])
        else:
            # Blank reference position: substitute the current minimum, so
            # it has no effect on the range calculated below.
            refseq_nuc_pos = min_position
        min_position = min(min_position, refseq_nuc_pos, query_nuc_pos)
        max_position = max(max_position, refseq_nuc_pos, query_nuc_pos)
        coordinates_name = row['coordinates']
        contig_name = row['contig']
        if row['coverage'] != '':
            # Coverage without deletions; remember the deepest value seen
            # for each coordinates name and for each contig.
            row_coverage = int(row['coverage']) - int(row['dels'])
            coordinate_depths[coordinates_name] = max(
                coordinate_depths[coordinates_name], row_coverage)
            contig_depths[contig_name] = max(contig_depths[contig_name],
                                             row_coverage)
        contig_groups[coordinates_name].add(contig_name)
    if '' in coordinate_depths:
        # Force partial contigs to come last.
        coordinate_depths[''] = -1
    # Shift every position so that the smallest one is drawn at 1.
    position_offset = -min_position + 1
    max_position += position_offset

    blast_rows = []
    if blast_csv is not None:
        for blast_row in DictReader(blast_csv):
            # Convert the positions to int, so the sort below is numeric.
            for field_name in ('start', 'end', 'ref_start', 'ref_end'):
                # noinspection PyTypeChecker
                blast_row[field_name] = int(blast_row[field_name])
            blast_rows.append(blast_row)
    blast_rows.sort(key=itemgetter('start', 'ref_start'))

    landmarks_path = (Path(__file__).parent.parent / "data" /
                      "landmark_references.yaml")
    landmark_groups = yaml.safe_load(landmarks_path.read_text())
    projects = ProjectConfig.loadDefault()
    f = Figure()
    # Visit the coordinate sets from deepest coverage to shallowest, with
    # ties broken by name.
    for _, coordinates_name in sorted(
        (-depth, name) for name, depth in coordinate_depths.items()):
        # Find the landmark set for these coordinates. The else clause runs
        # only when the loop finishes without hitting the break.
        for reference_set in landmark_groups:
            if coordinates_name != reference_set['coordinates']:
                continue
            # Fill in defaults: frame 0, and an end just before the start of
            # the next landmark in start order. This modifies the landmark
            # dicts in place.
            prev_landmark = None
            for i, landmark in enumerate(
                    sorted(reference_set['landmarks'],
                           key=itemgetter('start'))):
                landmark.setdefault('frame', 0)
                if prev_landmark and 'end' not in prev_landmark:
                    prev_landmark['end'] = landmark['start'] - 1
                prev_landmark = landmark
            # One Multitrack per run of consecutive landmarks with the same
            # frame, in the order they are listed in the YAML file.
            # NOTE(review): the list is not sorted by frame before groupby,
            # so a frame that appears in two separate runs gets two
            # Multitracks - confirm that is intended.
            for frame, frame_landmarks in groupby(reference_set['landmarks'],
                                                  itemgetter('frame')):
                subtracks = []
                for landmark in frame_landmarks:
                    landmark_colour = landmark.get('colour')
                    if landmark_colour is None:
                        # Landmarks without a colour are not drawn.
                        continue
                    subtracks.append(
                        Track(landmark['start'] + position_offset,
                              landmark['end'] + position_offset,
                              label=landmark['name'],
                              color=landmark_colour))
                    # Make room for landmarks beyond the covered positions.
                    max_position = max(max_position,
                                       landmark['end'] + position_offset)
                f.add(Multitrack(subtracks))
            break
        else:
            # No landmark set matches these coordinates.
            add_partial_banner(f, position_offset, max_position)
        contig_names = contig_groups[coordinates_name]
        sorted_contig_names = sort_contig_names(contig_names, contig_depths)
        # Collect an arrow for every blast match of this group's contigs.
        ref_arrows = []
        for contig_name in sorted_contig_names:
            if contig_name.startswith('contig-'):
                # No arrows on original contig tracks.
                continue
            contig_matcher = ContigMatcher(contig_name)
            ref_positions = None
            arrow_count = 0
            for blast_row in blast_rows:
                if not contig_matcher.is_match(blast_row):
                    continue
                if (ref_positions is None and coordinates_name != ''
                        and blast_row['ref_name'] != coordinates_name):
                    # The match is against a different reference than the
                    # one being drawn, so its positions get translated
                    # through map_references. The mapping is built for the
                    # first such row and then reused for the contig's
                    # remaining rows.
                    ref_positions = map_references(blast_row['ref_name'],
                                                   coordinates_name, projects)
                arrow_count += 1
                ref_start = int(blast_row['ref_start'])
                ref_end = int(blast_row['ref_end'])
                if ref_positions is None:
                    coordinate_start = ref_start
                    coordinate_end = ref_end
                else:
                    coordinate_start = ref_positions[ref_start]
                    coordinate_end = ref_positions[ref_end]
                ref_arrows.append(
                    Arrow(coordinate_start + position_offset,
                          coordinate_end + position_offset,
                          elevation=1,
                          label=f'{contig_matcher.num}.{arrow_count}'))
        if ref_arrows:
            f.add(ArrowGroup(ref_arrows))
        for contig_name in sorted_contig_names:
            # Rewind, so build_contig gets a reader over the whole file.
            genome_coverage_csv.seek(0)
            reader = DictReader(genome_coverage_csv)
            build_contig(reader, f, contig_name, max_position, position_offset,
                         blast_rows)

    if not f.elements:
        f.add(Track(1, max_position, label='No contigs found.', color='none'))
    return f
Beispiel #28
0
def main():
    """ Write simulated FASTQ files for a fixed set of samples.

    Each file is built from sections of a reference sequence, with the
    listed codon mutations applied. Files flagged as reversed hold the
    reverse complement and are labelled as read 2.
    """
    fastq_files = [
        FastqFile('2130A-HCV_S15_L001_R1_001.fastq',
                  '2130',
                  False,
                  (FastqSection('HCV2-JFH-1-NS5b', 1, 60, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 117, 176, 100)),
                  (CodonMutation(159, 'GTC'),)),
        FastqFile('2130A-HCV_S15_L001_R2_001.fastq',
                  '2130',
                  True,
                  (FastqSection('HCV2-JFH-1-NS5b', 57, 116, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 171, 230, 100)),
                  (CodonMutation(159, 'GTC'),)),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R1_001.fastq',
                  '2130',
                  False,
                  (FastqSection('HCV2-JFH-1-NS5b', 231, 313, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 396, 478, 100)),
                  (CodonMutation(316, 'AGC'),)),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R2_001.fastq',
                  '2130',
                  True,
                  (FastqSection('HCV2-JFH-1-NS5b', 313, 395, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 479, 561, 100)),
                  (CodonMutation(316, 'AGC'),)),
        FastqFile('2140A-HIV_S17_L001_R1_001.fastq',
                  '2140',
                  False,
                  (FastqSection('PR', 1, 80, 100),),
                  (CodonMutation(24, 'ATA'),)),
        FastqFile('2140A-HIV_S17_L001_R2_001.fastq',
                  '2140',
                  True,
                  (FastqSection('PR', 20, 99, 100),),
                  (CodonMutation(24, 'ATA'),)),
    ]
    projects = ProjectConfig.loadDefault()
    for fastq_file in fastq_files:
        # Reversed files are labelled as read 2 in every header line.
        file_num = '2' if fastq_file.is_reversed else '1'
        with open(fastq_file.name, 'w') as f:
            # Cluster numbers keep counting up across a file's sections.
            first_cluster = 1
            for section in fastq_file.sections:
                ref_name, ref_start, ref_end = find_coord_pos(
                    projects,
                    section.coord_name,
                    section.start_pos,
                    section.end_pos)

                nucs = list(projects.getReference(ref_name)[ref_start:ref_end])
                for mutation in fastq_file.mutations:
                    if not (section.start_pos <= mutation.pos
                            <= section.end_pos):
                        continue
                    # Three nucleotides per position within the section.
                    codon_start = 3 * (mutation.pos - section.start_pos)
                    nucs[codon_start:codon_start+3] = list(mutation.codon)
                read_seq = ''.join(nucs)
                if fastq_file.is_reversed:
                    read_seq = reverse_and_complement(read_seq)
                # The same quality character for every base.
                quality = 'A' * (ref_end-ref_start)

                # Write section.count identical reads, each with its own
                # cluster number in the header.
                last_cluster = first_cluster + section.count
                for cluster_num in range(first_cluster, last_cluster):
                    f.writelines([
                        '@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1\n'.format(
                            fastq_file.extract_num,
                            cluster_num,
                            file_num),
                        read_seq+'\n',
                        '+\n',
                        quality+'\n'])
                first_cluster += section.count
Beispiel #29
0
def aln2counts(aligned_csv,
               nuc_csv,
               amino_csv,
               coord_ins_csv,
               conseq_csv,
               failed_align_csv,
               callback=None,
               coverage_summary_csv=None,
               clipping_csv=None,
               conseq_ins_csv=None,
               g2p_aligned_csv=None,
               remap_conseq_csv=None,
               conseq_region_csv=None):
    """
    Count nucleotide and amino acid frequencies in aligned reads, and build
    consensus sequences from them.
    @param aligned_csv:         Open file handle containing aligned reads (from sam2aln)
    @param nuc_csv:             Open file handle to write nucleotide frequencies.
    @param amino_csv:           Open file handle to write amino acid frequencies.
    @param coord_ins_csv:       Open file handle to write insertions relative to coordinate reference.
    @param conseq_csv:          Open file handle to write consensus sequences.
    @param failed_align_csv:    Open file handle to write sample consensus sequences that failed to
                                align to the coordinate reference.
    @param callback: a function to report progress with three optional
        parameters - callback(message, progress, max_progress)
    @param coverage_summary_csv: Open file handle to write coverage depth.
    @param clipping_csv: Open file handle containing soft clipping counts
    @param conseq_ins_csv: Open file handle containing insertions relative to consensus sequence
    @param g2p_aligned_csv: Open file handle containing aligned reads (from fastq_g2p)
    @param remap_conseq_csv: Open file handle containing consensus sequences
        from the remap step.
    @param conseq_region_csv: Open file handle to write consensus sequences
        split into regions.
    """
    project_config = ProjectConfig.loadDefault()
    wants_coverage = coverage_summary_csv is not None

    with InsertionWriter(coord_ins_csv) as insertion_writer:
        seq_report = SequenceReport(insertion_writer,
                                    project_config,
                                    CONSEQ_MIXTURE_CUTOFFS)
        seq_report.consensus_min_coverage = CONSENSUS_MIN_COVERAGE

        # All output headers are written before any reads are processed.
        seq_report.write_amino_header(amino_csv)
        seq_report.write_consensus_header(conseq_csv)
        seq_report.write_consensus_regions_header(conseq_region_csv)
        seq_report.write_failure_header(failed_align_csv)
        seq_report.write_nuc_header(nuc_csv)

        # The coverage summary is optional: without a target file, neither
        # the writer nor the summary dictionary is created.
        coverage_writer = None
        coverage_totals = None
        if wants_coverage:
            coverage_writer = csv.DictWriter(
                coverage_summary_csv,
                ['avg_coverage', 'coverage_region', 'region_width'],
                lineterminator=os.linesep)
            coverage_writer.writeheader()
            coverage_totals = {}

        # Progress reporting is given the input's size on disk, so it is
        # only enabled when the aligned reads handle has a file name.
        aligned_path = getattr(aligned_csv, 'name', None) if callback else None
        if aligned_path:
            aligned_size = os.stat(aligned_path).st_size
            seq_report.enable_callback(callback, aligned_size)

        # Load whichever optional inputs were supplied, in a fixed order.
        optional_inputs = ((clipping_csv, seq_report.read_clipping),
                           (conseq_ins_csv, seq_report.read_insertions),
                           (remap_conseq_csv, seq_report.read_remap_conseqs))
        for handle, load in optional_inputs:
            if handle is not None:
                load(handle)

        seq_report.process_reads(g2p_aligned_csv, aligned_csv, coverage_totals)

        # Only write a summary row when there is something in the summary.
        if wants_coverage and coverage_totals:
            coverage_writer.writerow(coverage_totals)
Beispiel #30
0
def main():
    """ Run process_file() on every sample listed in the parsed arguments. """
    parsed_args = parse_args()
    project_config = ProjectConfig.loadDefault()
    for name in parsed_args.sample:
        process_file(name, project_config, parsed_args)
    print('Done.')
Beispiel #31
0
def main():
    """ Register a new pipeline version and its project definitions in QAI.

    Reads the default project configuration and the scoring configuration
    in ../project_scoring.json, then uses a qai_helper session to post:
    any seed groups and regions that are not on the server yet, a new
    pipeline record, any missing projects, and a project version with its
    project regions and key positions for every configured project.

    Raises RuntimeError if the server already has a pipeline with the
    requested version.
    """
    args = parse_args()
    project_config = ProjectConfig.loadDefault()
    # Plain 'r' already translates newlines on Python 3. The legacy 'rU'
    # mode is rejected by open() since Python 3.11.
    with open('../project_scoring.json', 'r') as scoring_file:
        scoring_config = json.load(scoring_file)
    with qai_helper.Session() as session:
        session.login(args.qai_server,
                      args.qai_user,
                      args.qai_password)

        # Refuse to upload the same pipeline version twice.
        pipelines = session.get_json(
            "/lab_miseq_pipelines?version=" + args.pipeline_version,
            retries=0)
        if pipelines:
            raise RuntimeError('Pipeline {} already exists.'.format(
                args.pipeline_version))

        # Index the existing seed groups and regions by name, then create
        # whatever the project configuration has that the server lacks.
        seed_groups = session.get_json("/lab_miseq_seed_groups")
        seed_group_ids = {seed_group['name']: seed_group['id']
                          for seed_group in seed_groups}
        old_regions = session.get_json("/lab_miseq_regions", retries=0)
        regions = {region['name']: region for region in old_regions}
        for region_name, region_data in project_config.config['regions'].items():
            region = regions.get(region_name)
            if region is None:
                seed_group_name = region_data['seed_group']
                seed_group_id = seed_group_ids.get(seed_group_name)
                # Regions with an empty seed group name are posted with
                # seed_group_id left as None.
                if seed_group_id is None and seed_group_name:
                    seed_group = session.post_json("/lab_miseq_seed_groups",
                                                   {'name': seed_group_name})
                    seed_group_id = seed_group['id']
                    seed_group_ids[seed_group_name] = seed_group_id
                region = session.post_json(
                    "/lab_miseq_regions",
                    {'name': region_name,
                     'is_nucleotide': region_data['is_nucleotide'],
                     'reference': ''.join(region_data['reference']),
                     'seed_group_id': seed_group_id})
                regions[region_name] = region

        pipeline = session.post_json("/lab_miseq_pipelines",
                                     {'version': args.pipeline_version})
        pipeline_id = pipeline['id']

        old_projects = session.get_json("/lab_miseq_projects", retries=0)
        projects = {project['name']: project for project in old_projects}
        for project_name, project_data in project_config.config['projects'].items():
            project = projects.get(project_name)
            if project is None:
                project = session.post_json(
                    "/lab_miseq_projects",
                    {'name': project_name,
                     'max_variants': project_data['max_variants']})
            # Every project gets a new version tied to the new pipeline.
            project_version = session.post_json("/lab_miseq_project_versions",
                                                {'pipeline_id': pipeline_id,
                                                 'project_id': project['id']})
            for i, region_data in enumerate(project_data['regions']):
                # Scoring entries are matched to project regions by position.
                scoring_data = scoring_config['projects'][project_name]['regions'][i]
                coordinate_region = regions[region_data['coordinate_region']]
                # The first seed region supplies the seed group.
                seed_region = regions[region_data['seed_region_names'][0]]
                seed_group_id = seed_region['seed_group_id']
                project_region = session.post_json(
                    "/lab_miseq_project_regions",
                    {'project_version_id': project_version['id'],
                     'coordinate_region_id': coordinate_region['id'],
                     'min_coverage1': scoring_data['min_coverage1'],
                     'min_coverage2': scoring_data['min_coverage2'],
                     'min_coverage3': scoring_data['min_coverage3'],
                     'seed_group_id': seed_group_id})

                for key_position in scoring_data['key_positions']:
                    session.post_json("/lab_miseq_key_positions",
                                      {'project_region_id': project_region['id'],
                                       'start_pos': key_position['start_pos'],
                                       'end_pos': key_position['end_pos']})

    print("Done.")
Beispiel #32
0
def main():
    """ Write the synthetic FASTQ files described by the hard-coded list.

    Each FastqFile entry names an output file, an extract number, whether
    the reads are reverse complemented, and the sections to write. Every
    section is cut from a project reference, has its mutations applied,
    and is written section.count times as identical reads.
    """
    projects = ProjectConfig.loadDefault()
    sections_2100hcv_1, sections_2100hcv_2 = make_random_sections(
        'HCV1A-H77-NS5a', 1, 300, projects, 400)
    sections_2100v3_1 = [
        FastqSection('HIV1-B-FR-K03455-seed', 7056, 7312, 50),
        FastqSection('HIV1-B-FR-K03455-seed', 7062, 7312, 50)
    ]
    sections_2100v3_2 = [
        FastqSection('HIV1-B-FR-K03455-seed', 7123, 7373, 50),
        FastqSection('HIV1-B-FR-K03455-seed', 7123, 7376, 50)
    ]
    sections_2100hiv_1, sections_2100hiv_2 = make_random_sections(
        'RT', 1, 300, projects, 400)
    sections_2160_1, sections_2160_2 = make_random_sections(
        'HCV2-JFH-1-NS5b',
        1,
        230,
        projects,
        mutations=(CodonMutation(159, 'GTC'), ))
    sections_2160midi_1, sections_2160midi_2 = make_random_sections(
        'HCV2-JFH-1-NS5b',
        231,
        561,
        projects,
        mutations=(CodonMutation(316, 'AGC'), ))
    sections_2170_1a_1, sections_2170_1a_2 = make_random_sections(
        'HCV-1a', 6258, 9375)
    sections_2170_2_1, sections_2170_2_2 = make_random_sections(
        'HCV-2a', 6269, 9440)
    sections_2180_1, sections_2180_2 = make_random_sections(
        'HIV1-B-FR-K03455-seed', 6225, 7757)
    hxb2_ref = projects.getReference('HIV1-B-FR-K03455-seed')

    # Add a region built from two slices of the HXB2 reference, so that
    # make_random_sections() can look it up by name.
    projects.config['regions']['HXB2-with-deletion'] = dict(
        reference=hxb2_ref[617:928] + hxb2_ref[9358:9652],
        is_nucleotide=True,
        seed_group=None)
    sections_2210_1, sections_2210_2 = make_random_sections(
        'HXB2-with-deletion', projects=projects)
    fastq_files = [
        FastqFile('2010A-V3LOOP_S3_L001_R1_001.fastq', '2010', False,
                  (FastqSection('HIV1-CON-XX-Consensus-seed', 855, 906, 10),
                   FastqSection('HIV1-CON-XX-Consensus-seed', 912, 960, 10))),
        FastqFile('2010A-V3LOOP_S3_L001_R2_001.fastq', '2010', True,
                  (FastqSection('HIV1-CON-XX-Consensus-seed', 855, 906, 10),
                   FastqSection('HIV1-CON-XX-Consensus-seed', 912, 960, 10))),
        FastqFile('2020A-GP41_S4_L001_R1_001.fastq', '2020', False,
                  (FastqSection('HIV1-B-FR-KF716496-seed', 6957, 7065, 10,
                                (CodonMutation(6981, 'GGGATA'), )), )),
        FastqFile('2020A-GP41_S4_L001_R2_001.fastq', '2020', True,
                  (FastqSection('HIV1-B-FR-KF716496-seed', 6957, 7065, 10,
                                (CodonMutation(6981, 'GGGATA'), )), )),
        FastqFile('2040A-HLA-B_S6_L001_R1_001.fastq', '2040', False,
                  (FastqSection('HLA-B-seed', 201, 315, 80),
                   FastqSection('HLA-B-seed', 201, 315, 20,
                                (CodonMutation(207, 'TCT'), )))),
        FastqFile('2040A-HLA-B_S6_L001_R2_001.fastq', '2040', True,
                  (FastqSection('HLA-B-seed', 201, 315, 80),
                   FastqSection('HLA-B-seed', 201, 315, 20,
                                (CodonMutation(207, 'TCT'), )))),
        FastqFile(
            '2070A-PR_S9_L001_R1_001.fastq', '2070', False,
            (FastqSection('PR', 40, 80, 12, (CodonMutation(45, ''), )),
             FastqSection('PR', 40, 80, 3,
                          (CodonMutation(45, ''), CodonMutation(64, ''))))),
        FastqFile(
            '2070A-PR_S9_L001_R2_001.fastq', '2070', True,
            (FastqSection('PR', 40, 80, 12, (CodonMutation(45, ''), )),
             FastqSection('PR', 40, 80, 3,
                          (CodonMutation(45, ''), CodonMutation(64, ''))))),
        FastqFile('2100A-HCV-1337B-V3LOOP-PWND-HIV_S12_L001_R1_001.fastq',
                  '2100', False,
                  sections_2100hcv_1 + sections_2100v3_1 + sections_2100hiv_1),
        FastqFile('2100A-HCV-1337B-V3LOOP-PWND-HIV_S12_L001_R2_001.fastq',
                  '2100', True,
                  sections_2100hcv_2 + sections_2100v3_2 + sections_2100hiv_2),
        FastqFile('2130A-HCV_S15_L001_R1_001.fastq', '2130', False,
                  (FastqSection('HCV2-JFH-1-NS5b', 1, 66, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 115, 181, 100,
                                (CodonMutation(159, 'GTC'), )))),
        FastqFile('2130A-HCV_S15_L001_R2_001.fastq', '2130', True,
                  (FastqSection('HCV2-JFH-1-NS5b', 51, 114, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 165, 230, 100))),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R1_001.fastq', '2130', False,
                  (FastqSection('HCV2-JFH-1-NS5b', 231, 315, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 398, 485, 100))),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R2_001.fastq', '2130', True,
                  (FastqSection('HCV2-JFH-1-NS5b', 305, 397, 100,
                                (CodonMutation(316, 'AGC'), )),
                   FastqSection('HCV2-JFH-1-NS5b', 470, 561, 100))),
        FastqFile('2140A-HIV_S17_L001_R1_001.fastq', '2140', False,
                  (FastqSection('PR', 1, 80, 100,
                                (CodonMutation(24, 'ATA'), )), )),
        FastqFile('2140A-HIV_S17_L001_R2_001.fastq', '2140', True,
                  (FastqSection('PR', 20, 99, 100,
                                (CodonMutation(24, 'ATA'), )), )),
        # Simplify with one_contig.
        FastqFile('2160A-HCV_S19_L001_R1_001.fastq', '2160', False,
                  sections_2160_1),
        FastqFile('2160A-HCV_S19_L001_R2_001.fastq', '2160', True,
                  sections_2160_2),
        # Simplify with one_contig.
        FastqFile('2160AMIDI-MidHCV_S20_L001_R1_001.fastq', '2160', False,
                  sections_2160midi_1),
        FastqFile('2160AMIDI-MidHCV_S20_L001_R2_001.fastq', '2160', True,
                  sections_2160midi_2),
        # Simplify with two_long_contigs.
        FastqFile('2170A-HCV_S21_L001_R1_001.fastq', '2170', False,
                  sections_2170_1a_1 + sections_2170_2_1),
        FastqFile('2170A-HCV_S21_L001_R2_001.fastq', '2170', True,
                  sections_2170_1a_2 + sections_2170_2_2),
        FastqFile('2180A-HIV_S22_L001_R1_001.fastq', '2180', False,
                  sections_2180_1),
        FastqFile('2180A-HIV_S22_L001_R2_001.fastq', '2180', True,
                  sections_2180_2),
        FastqFile('2190A-SARSCOV2_S23_L001_R1_001.fastq', '2190', False,
                  (FastqSection('SARS-CoV-2-ORF1ab', 4393, 4429, 50,
                                (CodonMutation(4400, 'TCA'), )),
                   FastqSection('SARS-CoV-2-ORF1ab', 4393, 4430, 50,
                                (CodonMutation(4400, 'TCA'), )))),
        FastqFile('2190A-SARSCOV2_S23_L001_R2_001.fastq', '2190', True,
                  (FastqSection('SARS-CoV-2-ORF1ab', 4393, 4429, 50,
                                (CodonMutation(4400, 'TCA'), )),
                   FastqSection('SARS-CoV-2-ORF1ab', 4393, 4430, 50,
                                (CodonMutation(4400, 'TCA'), )))),
        FastqFile('2200A-SARSCOV2_S24_L001_R1_001.fastq', '2200', False,
                  (FastqSection('SARS-CoV-2-nsp1', 20, 66, 100), )),
        FastqFile('2200A-SARSCOV2_S24_L001_R2_001.fastq', '2200', True,
                  (FastqSection('SARS-CoV-2-nsp1', 56, 102, 100), )),
        FastqFile('2210A-NFLHIVDNA_S25_L001_R1_001.fastq', '2210', False,
                  sections_2210_1),
        FastqFile('2210A-NFLHIVDNA_S25_L001_R2_001.fastq', '2210', True,
                  sections_2210_2)
    ]

    header_template = '@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1\n'
    for fastq in fastq_files:
        with open(fastq.name, 'w') as out:
            read_number = '2' if fastq.is_reversed else '1'
            # Cluster numbers keep counting up across a file's sections.
            first_cluster = 1
            for section in fastq.sections:
                ref_name, ref_start, ref_end = find_coord_pos(
                    projects, section.coord_name, section.start_pos,
                    section.end_pos)

                bases = list(projects.getReference(ref_name)[ref_start:ref_end])
                # When find_coord_pos() hands back the section's own
                # positions, mutation offsets are used as they are.
                # Otherwise they are multiplied by three.
                coords_unchanged = ((ref_start, ref_end) == (section.start_pos,
                                                             section.end_pos))
                step = 1 if coords_unchanged else 3
                for mutation in section.mutations:
                    if not (section.start_pos <= mutation.pos <= section.end_pos):
                        continue
                    offset = (mutation.pos - section.start_pos) * step
                    # A replacement that is not three bases long changes
                    # the length of the read.
                    bases[offset:offset + 3] = list(mutation.codon)

                read_seq = ''.join(bases)
                if fastq.is_reversed:
                    read_seq = reverse_and_complement(read_seq)
                quality = 'A' * len(read_seq)
                record_tail = read_seq + '\n' + '+\n' + quality + '\n'
                # noinspection PyTypeChecker
                for cluster in range(first_cluster,
                                     first_cluster + section.count):
                    out.write(header_template.format(fastq.extract_num,
                                                     cluster,
                                                     read_number) +
                              record_tail)
                first_cluster += section.count
Beispiel #33
0
def main():
    """ Run process_file() on every sample listed in the parsed arguments. """
    parsed_args = parse_args()
    project_config = ProjectConfig.loadDefault()
    for name in parsed_args.sample:
        process_file(name, project_config, parsed_args)
    print('Done.')
Beispiel #34
0
def load_projects() -> ProjectConfig:
    """ Return the project configuration from ProjectConfig.loadDefault(). """
    default_projects = ProjectConfig.loadDefault()
    return default_projects