def setUp(self):
        """ Prepare a JSON project definition stream and an empty config.

        The JSON text defines two projects: "R1" with a single R1 region, and
        "R1 and R2" with an R1 region (key positions 1 and 3) plus an R2
        region. Nothing is loaded into the config here.
        """
        project_json = """\
{
  "projects": {
    "R1": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        }
      ]
    },
    "R1 and R2": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [1, 3],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        },
        {
          "coordinate_region": "R2",
          "coordinate_region_length": 1,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R2-seed"
          ]
        }
      ]
    }
  }
}
"""
        # In-memory stream holding the project definitions.
        self.defaultJsonIO = StringIO(project_json)
        # Configuration object under test, still empty.
        self.config = ProjectConfig()
Exemple #2
0
def map_references(contig_ref_name: str, coordinates_name: str,
                   projects: ProjectConfig) -> typing.Mapping[int, int]:
    """ Map positions in one reference onto positions in another.

    The two references are aligned with align_nucs(), then walked column by
    column.

    :param contig_ref_name: name of the reference whose positions are the keys
    :param coordinates_name: name of the reference whose positions are the
        values
    :param projects: configuration used to look up both sequences
    :return: {contig_ref_pos: coordinates_pos}, with both counted from one.
        Each value is the number of coordinate bases seen up to and including
        that alignment column.
    """
    contig_seq = projects.getReference(contig_ref_name)
    target_seq = projects.getReference(coordinates_name)
    aligned_target, aligned_contig, _ = align_nucs(target_seq, contig_seq)

    position_map = {}
    target_index = 0
    contig_index = 0
    for target_char, contig_char in zip(aligned_target, aligned_contig):
        if target_char != '-':
            target_index += 1
        if contig_char == '-':
            # Gap in the contig reference: no position to record.
            continue
        contig_index += 1
        position_map[contig_index] = target_index
    return position_map
Exemple #3
0
def find_coord_pos(projects: ProjectConfig,
                   coord_name: str,
                   start_pos: int = None,
                   end_pos: int = None):
    """ Find the seed reference section that matches a coordinate section.

    :param projects: project configuration holding the references
    :param coord_name: name of the coordinate reference
    :param start_pos: start position in the coordinate reference, defaults
        to 1
    :param end_pos: end position in the coordinate reference, defaults to one
        more than the length of the coordinate reference
    :return: (ref_name, ref_start, ref_end). For a nucleotide coordinate
        reference, this is coord_name with the positions unchanged. Otherwise
        it is the best-scoring seed reference, with nucleotide offsets
        calculated from the amino acid alignment.
    :raise ValueError: if no seed reference aligns with a positive score
    """
    coord_seq = projects.getReference(coord_name)
    if start_pos is None:
        start_pos = 1
    if end_pos is None:
        # NOTE(review): in the amino acid branch below, coord_pos presumably
        # never exceeds len(coord_seq), so this default would never match and
        # the ref_end assertion would fail - confirm the intended default.
        end_pos = len(coord_seq) + 1
    if projects.config['regions'][coord_name]['is_nucleotide']:
        # Already have a nucleotide sequence, nothing to do.
        return coord_name, start_pos, end_pos
    gap_open = 40
    gap_extend = 10
    use_terminal_gap_penalty = 1
    highest_score = 0
    best_match = None

    # Collect every seed reference that any project pairs with this
    # coordinate reference.
    ref_names = set()
    for project in projects.config['projects'].values():
        for region in project['regions']:
            if coord_name == region['coordinate_region']:
                ref_names.update(region['seed_region_names'])

    # Try all three reading frames of each seed, and keep the best alignment.
    # Sorting makes the choice repeatable when scores are tied.
    for ref_name in sorted(ref_names):
        ref_nuc_seq = projects.getReference(ref_name)
        for nuc_offset in range(3):
            ref_amino_seq = translate(ref_nuc_seq, nuc_offset)
            aligned_coord, aligned_ref, score = align_it_aa(
                coord_seq, ref_amino_seq, gap_open, gap_extend,
                use_terminal_gap_penalty)
            if score > highest_score:
                highest_score = score
                best_match = (ref_name, nuc_offset, aligned_coord, aligned_ref)
    if best_match is None:
        # Without this check, unpacking None fails with an unhelpful
        # TypeError.
        raise ValueError(
            'No seed reference aligned to coordinate reference {!r} with a '
            'positive score.'.format(coord_name))
    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match

    # Walk the alignment, converting amino acid positions in the coordinate
    # reference to nucleotide offsets in the seed reference.
    coord_pos = ref_pos = 0
    ref_start = ref_end = None
    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
        if ref_amino != '-':
            ref_pos += 1
        if coord_amino != '-':
            coord_pos += 1
            if start_pos == coord_pos:
                ref_start = ref_pos * 3 - nuc_offset - 3
            if coord_pos == end_pos:
                ref_end = ref_pos * 3 - nuc_offset
    assert ref_start is not None
    assert ref_end is not None
    return ref_name, ref_start, ref_end
Exemple #4
0
    def __init__(self,
                 file=None,
                 rules_yaml=None,
                 genotype=None,
                 references=None):
        """ Initialize the rule definitions, and optionally load them.

        :param file: passed to load_xml() when it is not None
        :param rules_yaml: passed to load_yaml() with genotype, when file is
            None and this is not
        :param genotype: passed along to load_yaml()
        :param references: {name: reference}; when None, the default project
            references are used, updated with the entries loaded from
            WILD_TYPES_PATH
        """
        if references is None:
            references = ProjectConfig.loadDefault().getAllReferences()
            with WILD_TYPES_PATH.open() as wild_types_file:
                references.update(safe_load(wild_types_file))

        # Same references, except that the name INT is stored as IN.
        standards = {}
        for name, ref in references.items():
            standards['IN' if name == 'INT' else name] = ref
        self.stds = standards

        # Algorithm info
        self.alg_name = ''
        self.alg_version = ''

        # Definitions
        # {code: [drug_class_code]}
        self.gene_def = {}
        # {'1': 'Susceptible'}
        self.level_def = {}
        # {code: [drug_code]}
        self.drug_class = defaultdict(list)
        # [['-INF', '9', '1'], ...]: the first two are the range, and the
        # third one is the res level.
        self.global_range = []
        # {code: comment_text}
        self.comment_def = {}

        # {code: (name, [condition, [(action_type, action_value)]])}
        self.drugs = {}
        # Maybe skip for now? We don't really use this at the moment.
        self.mutation_comments = []

        if file is not None:
            self.load_xml(file)
            return
        if rules_yaml is not None:
            self.load_yaml(rules_yaml, genotype)
Exemple #5
0
 def build_config(project, sequences):
     """ Build a ProjectConfig with a single project and one region.

     :param project: name of the project entry
     :param sequences: {name: sequence}; every name becomes a seed region of
         the project, and gets a one-item reference list
     :return: the new ProjectConfig, with its config attribute filled in
     """
     projects = ProjectConfig()
     seed_names = list(sequences.keys())
     project_definition = {'regions': [{'seed_region_names': seed_names}]}
     region_definitions = {}
     for name, sequence in sequences.items():
         region_definitions[name] = {'reference': [sequence]}
     projects.config = {'projects': {project: project_definition},
                        'regions': region_definitions}
     return projects
Exemple #6
0
def extract_v3loop_ref():
    """ Return the V3LOOP reference text, caching it next to this module.

    The text is read from v3loop_ref.txt in this module's folder. When that
    file does not exist, the text is built with extract_target() from the
    G2P_SEED_NAME and COORDINATE_REF_NAME references of the default project
    configuration, then written to the file.
    """
    here = os.path.dirname(__file__)
    cache_path = os.path.join(here, 'v3loop_ref.txt')
    try:
        with open(cache_path) as cache_file:
            return cache_file.read()
    except FileNotFoundError:
        config = ProjectConfig.loadDefault()
        seed_seq = config.getReference(G2P_SEED_NAME)
        coord_seq = config.getReference(COORDINATE_REF_NAME)
        extracted = extract_target(seed_seq, coord_seq)
        with open(cache_path, 'w') as cache_file:
            cache_file.write(extracted)
        return extracted
Exemple #7
0
def extract_v3loop_ref():
    """ Load the cached V3LOOP reference, creating the cache when missing.

    The cache is v3loop_ref.txt, beside this module. If it can't be found,
    extract_target() builds the reference from the default project
    configuration's G2P_SEED_NAME and COORDINATE_REF_NAME references, and the
    result is saved to the cache file before it is returned.
    """
    ref_path = os.path.join(os.path.dirname(__file__), 'v3loop_ref.txt')
    try:
        with open(ref_path) as ref_file:
            cached = ref_file.read()
    except FileNotFoundError:
        defaults = ProjectConfig.loadDefault()
        cached = extract_target(defaults.getReference(G2P_SEED_NAME),
                                defaults.getReference(COORDINATE_REF_NAME))
        with open(ref_path, 'w') as ref_file:
            ref_file.write(cached)
    return cached
Exemple #8
0
def load_references():
    """ Collect the HCV references from the default project configuration.

    Reference names are expected to look like HCV<genotype>-...-<region>.
    Names that don't match, or whose region is not in HCV_REGIONS, are
    skipped. Genotype 6 references are also registered as genotype 6E.

    :return: {(genotype, region): Reference}
    """
    all_references = ProjectConfig.loadDefault().getAllReferences()
    references = {}  # {(genotype, region): Reference}
    for ref_name, sequence in all_references.items():
        match = re.match(r'HCV(.*?)-.*-([^-]+)$', ref_name)
        if match is None:
            continue
        genotype, region = match.groups()
        if region not in HCV_REGIONS:
            continue
        reference = Reference(ref_name, sequence)
        references[(genotype, region)] = reference
        if genotype == '6':
            references[('6E', region)] = reference
    return references
Exemple #9
0
 def setUp(self):
     """ Load two seed regions, and open in-memory output streams.

     Both seeds, R1-seed and R2-seed, belong to seed group "main". The remap
     counts stream starts with a header row.
     """
     config = ProjectConfig()
     seed_json = StringIO("""\
         {
           "regions": {
             "R1-seed": {
               "seed_group": "main",
               "reference": ["ACTAAAGGG"]
             },
             "R2-seed": {
               "seed_group": "main",
               "reference": ["ACTAAAGGGAAA"]
             }
           }
         }
         """)
     config.load(seed_json)
     self.projects = config

     self.sam_file = StringIO()
     self.remap_counts = StringIO()
     column_names = ['type', 'filtered_count', 'count']
     counts_writer = DictWriter(self.remap_counts,
                                column_names,
                                lineterminator=os.linesep)
     self.remap_counts_writer = counts_writer
     counts_writer.writeheader()
Exemple #10
0
def load_references():
    """ Build a lookup of HCV references by genotype and region.

    Only names of the form HCV<genotype>-...-<region> are used, and only when
    the region is listed in HCV_REGIONS. Each genotype 6 reference is stored a
    second time under genotype 6E.

    :return: {(genotype, region): Reference}
    """
    projects = ProjectConfig.loadDefault()
    by_genotype_region = {}  # {(genotype, region): Reference}
    for ref_name, sequence in projects.getAllReferences().items():
        name_match = re.match(r'HCV(.*?)-.*-([^-]+)$', ref_name)
        if not name_match:
            continue
        genotype, region = name_match.group(1, 2)
        if region in HCV_REGIONS:
            hcv_reference = Reference(ref_name, sequence)
            keys = [(genotype, region)]
            if genotype == '6':
                keys.append(('6E', region))
            for key in keys:
                by_genotype_region[key] = hcv_reference
    return by_genotype_region
Exemple #11
0
def main():
    """ Write simulated FASTQ files for sections of HCV2-JFH-1-NS5b.

    Each FastqFile entry names an output file, an extract number for the read
    headers, whether the reads are reverse complemented, the coordinate
    sections to cover, and codon mutations to apply. Every section is written
    section.count times as identical reads, with a quality of 'A' for every
    base.
    """
    fastq_files = [
        FastqFile('2130A-HCV_S15_L001_R1_001.fastq', '2130', False,
                  (FastqSection('HCV2-JFH-1-NS5b', 1, 60, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 117, 176, 100)),
                  (CodonMutation(159, 'GTC'), )),
        FastqFile('2130A-HCV_S15_L001_R2_001.fastq', '2130', True,
                  (FastqSection('HCV2-JFH-1-NS5b', 57, 116, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 171, 230, 100)),
                  (CodonMutation(159, 'GTC'), )),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R1_001.fastq', '2130', False,
                  (FastqSection('HCV2-JFH-1-NS5b', 231, 313, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 396, 478, 100)),
                  (CodonMutation(316, 'AGC'), )),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R2_001.fastq', '2130', True,
                  (FastqSection('HCV2-JFH-1-NS5b', 313, 395, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 479, 561, 100)),
                  (CodonMutation(316, 'AGC'), ))
    ]
    projects = ProjectConfig.loadDefault()
    for fastq_file in fastq_files:
        # The read number in the header depends only on the file.
        file_num = '2' if fastq_file.is_reversed else '1'
        with open(fastq_file.name, 'w') as f:
            # Cluster numbers keep counting up across sections of a file.
            next_cluster = 1
            for section in fastq_file.sections:
                ref_name, ref_start, ref_end = find_coord_pos(
                    projects, section.coord_name, section.start_pos,
                    section.end_pos)

                ref_nuc_seq = projects.getReference(ref_name)
                ref_nuc_section = list(ref_nuc_seq[ref_start:ref_end])
                for mutation in fastq_file.mutations:
                    if section.start_pos <= mutation.pos <= section.end_pos:
                        # Replace the codon at the mutation's position,
                        # counting codons from the start of the section.
                        section_pos = (mutation.pos - section.start_pos) * 3
                        ref_nuc_section[section_pos:section_pos + 3] = list(
                            mutation.codon)
                ref_nuc_section = ''.join(ref_nuc_section)
                if fastq_file.is_reversed:
                    ref_nuc_section = reverse_and_complement(ref_nuc_section)
                # One quality character per base actually written, so the
                # record stays valid even if the slice came out shorter than
                # ref_end - ref_start.
                phred_scores = 'A' * len(ref_nuc_section)
                for cluster in range(section.count):
                    f.write(
                        '@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1\n'
                        .format(fastq_file.extract_num, cluster + next_cluster,
                                file_num))
                    f.write(ref_nuc_section + '\n')
                    f.write('+\n')
                    f.write(phred_scores + '\n')
                next_cluster += section.count
Exemple #12
0
    def setUp(self):
        self.defaultJsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 5,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"],
          "id": 10042
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ],
      "seed_group": "R1-seeds"
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "RWN",
        "NWR"
      ],
      "seed_group": null
    }
  }
}
""")
        self.config = ProjectConfig()
Exemple #13
0
def read_contigs(contigs_csv, excluded_seeds=None):
    """ Read contig sequences, optionally merging them by reference group.

    :param contigs_csv: open CSV file with columns contig, match, ref, and
        group_ref; it is closed when reading finishes
    :param excluded_seeds: passed through to get_contig_name()
    :return: {contig_name: sequence}. Contigs that are not merged get one
        entry each. Merged contigs get one entry per group_ref, named with
        the contig numbers joined by underscores.
    """
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    contig_groups = defaultdict(
        list)  # {group_ref_name: [seq, index, index...]}
    conseqs = {}
    projects = ProjectConfig.loadDefault()
    with contigs_csv:
        contigs_reader = DictReader(contigs_csv)
        # Rows are numbered from one, then processed from last to first.
        for i, row in reversed(list(enumerate(contigs_reader, 1))):
            contig_seq = row['contig']
            match_fraction = float(row['match'])
            is_match = 0.25 <= match_fraction
            is_reversed = match_fraction < 0
            if not (ARE_CONTIGS_MERGED and is_match):
                # Merging is switched off or the match is too weak, so keep
                # this contig as its own entry.
                contig_name = get_contig_name(i, row['ref'], is_match,
                                              is_reversed, excluded_seeds)
                conseqs[contig_name] = contig_seq
                continue
            group_ref_name = row['group_ref']
            contig_group = contig_groups[group_ref_name]
            if not contig_group:
                # First contig in this group: start from the group reference.
                contig_group.append(projects.getReference(group_ref_name))
            contig_group.append(str(i))
            group_seq = contig_group[0]
            agroup, acontig, score = align_it(group_seq, contig_seq,
                                              gap_open_penalty,
                                              gap_extend_penalty,
                                              use_terminal_gap_penalty)
            # Find the aligned contig's span, from its first non-gap
            # character to its last.
            # NOTE(review): match would be None if acontig held nothing but
            # gaps, and the next line would fail - confirm that can't happen.
            match = re.match('-*([^-](.*[^-])?)', acontig)
            start = match.start(1)
            end = match.end(1)
            # Replace that span of the aligned group with the raw contig.
            merged_seq = agroup[:start] + contig_seq + agroup[end:]
            # Drop as many characters from each end as the aligned group had
            # gaps there.
            left_trim = len(agroup) - len(agroup.lstrip('-'))
            right_trim = len(agroup) - len(agroup.rstrip('-'))
            # A right trim of zero becomes None, to keep the whole tail.
            contig_group[0] = merged_seq[left_trim:-right_trim or None]

    # Merged groups are always named as forward matches.
    is_match = True
    is_reversed = False
    for group_ref_name, contig_group in contig_groups.items():
        (group_seq, *contig_nums) = contig_group
        # Contig numbers were collected in reverse, so put them back in order.
        prefix = '_'.join(reversed(contig_nums))
        contig_name = get_contig_name(prefix, group_ref_name, is_match,
                                      is_reversed, excluded_seeds)
        conseqs[contig_name] = group_seq
    return conseqs
Exemple #14
0
def fastq_g2p(pssm,
              fastq1,
              fastq2,
              g2p_csv,
              g2p_summary_csv=None,
              unmapped1=None,
              unmapped2=None,
              aligned_csv=None,
              min_count=1,
              min_valid=1,
              min_valid_percent=0.0,
              merged_contigs_csv=None):
    """ Run the G2P steps on a pair of FASTQ files, and write the results.

    :param pssm: passed to write_rows()
    :param fastq1: passed to FastqReader with fastq2
    :param fastq2: passed to FastqReader with fastq1
    :param g2p_csv: passed to write_rows(); if it has a name attribute, the
        read counts prefix is "read_counts" in the same folder
    :param g2p_summary_csv: passed to write_rows()
    :param unmapped1: passed to write_unmapped_reads()
    :param unmapped2: passed to write_unmapped_reads()
    :param aligned_csv: if not None, passed to write_aligned_reads()
    :param min_count: passed to write_rows()
    :param min_valid: passed to write_rows()
    :param min_valid_percent: passed to write_rows()
    :param merged_contigs_csv: if not None, receives a contig column with
        each consensus that has something besides N and dash characters
    """
    count_prefix = None
    output_name = getattr(g2p_csv, 'name', None)
    if output_name is not None:
        count_prefix = os.path.join(os.path.dirname(output_name),
                                    'read_counts')

    config = ProjectConfig.loadDefault()
    hiv_seed = config.getReference(G2P_SEED_NAME)
    v3loop_ref = extract_target(hiv_seed,
                                config.getReference(COORDINATE_REF_NAME))

    # Each step wraps the previous one, so keep them in this order.
    consensus_builder = ConsensusBuilder()
    reads = merge_reads(FastqReader(fastq1, fastq2))
    reads = consensus_builder.build(reads)
    reads = trim_reads(reads, v3loop_ref)
    reads = write_unmapped_reads(reads, unmapped1, unmapped2)
    read_counts = count_reads(reads, count_prefix)
    if aligned_csv is not None:
        read_counts = write_aligned_reads(read_counts, aligned_csv, hiv_seed,
                                          v3loop_ref)

    write_rows(pssm,
               read_counts,
               g2p_csv,
               g2p_summary_csv,
               min_count,
               min_valid=min_valid,
               min_valid_percent=min_valid_percent)

    if merged_contigs_csv is None:
        return
    contig_writer = DictWriter(merged_contigs_csv, ['contig'])
    contig_writer.writeheader()
    for consensus in consensus_builder.get_consensus_by_lengths():
        # Skip a consensus made of nothing but N's and dashes.
        if consensus.replace('N', '').replace('-', ''):
            contig_writer.writerow({'contig': consensus})
Exemple #15
0
def test_duplicated_sars_base_amino(sequence_report):
    """ Special case for duplicated base in SARS orf1ab.

    Expect amino sequence AQSFLNRVCG.
    """

    # refname,qcut,rank,count,offset,seq
    aligned_reads = prepare_reads("""\
SARS-CoV-2-seed,15,0,9,0,GCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACAC
""")
    # Repeat is here:                     ^

    #                                       A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,...,coverage
    expected_text = """\
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,1,4396,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,4,4397,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,7,4398,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,10,4399,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,13,4400,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,16,4401,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,18,4402,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,21,4403,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,24,4404,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,27,4405,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9"""
    # Use the default project configuration for the reference lookups.
    sequence_report.projects = ProjectConfig.loadDefault()
    orf1ab_size = len(
        sequence_report.projects.getReference('SARS-CoV-2-ORF1ab'))
    nsp12_size = len(sequence_report.projects.getReference('SARS-CoV-2-nsp12'))

    report_file = StringIO()
    sequence_report.write_amino_header(report_file)
    sequence_report.read(aligned_reads)
    sequence_report.write_amino_counts()

    report = report_file.getvalue()
    report_lines = report.splitlines()
    # One line for each position in the two references, plus one more line
    # (presumably the header).
    expected_size = orf1ab_size + nsp12_size + 1
    if len(report_lines) != expected_size:
        # This always fails when it's reached, and the failure message shows
        # the whole report.
        assert (len(report_lines), report) == (expected_size, '')

    # Only compare the ten report lines at indexes 4396 to 4405.
    key_lines = report_lines[4396:4406]
    key_report = '\n'.join(key_lines)
    assert key_report == expected_text
Exemple #16
0
def main():
    """ Check the default project references, and print an error total.

    Each check returns its error count. Any reference names still left in the
    unchecked set after all the checks are reported, and each one counts as
    an error.
    """
    project_config = ProjectConfig.loadDefault()
    unchecked_ref_names = set(project_config.getAllReferences().keys())
    error_count = 0 + (
        check_hcv_seeds(project_config, unchecked_ref_names) +
        check_hcv_coordinates(project_config, unchecked_ref_names) +
        check_hiv_seeds(project_config, unchecked_ref_names) +
        check_hiv_coordinates(project_config, unchecked_ref_names) +
        check_hiv_wild_types(project_config) +
        check_hla_seeds(project_config, unchecked_ref_names) +
        check_hla_coordinates(project_config, unchecked_ref_names))

    if unchecked_ref_names:
        leftover_names = ', '.join(sorted(unchecked_ref_names))
        print(fill_report('Unchecked refs: ' + leftover_names))
        error_count += len(unchecked_ref_names)
    else:
        print('No unchecked refs.')
    print(f'Total errors: {error_count}.')
Exemple #17
0
def test_duplicated_sars_base_nuc(sequence_report):
    """ Make sure duplicated base in SARS isn't duplicated in nuc.csv. """

    # refname,qcut,rank,count,offset,seq
    aligned_reads = prepare_reads("""\
SARS-CoV-2-seed,15,0,9,10,ACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCG
""")

    #                  A,C,G,T,N,...,coverage
    expected_section = """\
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,21,13198,0,0,0,9,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,22,13199,0,0,0,9,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,23,13200,9,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,24,13201,9,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,25,13202,9,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,26,13203,0,9,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,27,13204,0,0,9,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,28,13205,0,0,9,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,29,13206,0,0,9,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,30,13207,0,0,0,9,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,31,13208,0,0,0,9,0,0,0,0,0,9"""
    # Use the default project configuration for the reference lookups.
    sequence_report.projects = ProjectConfig.loadDefault()
    orf1ab_size = len(
        sequence_report.projects.getReference('SARS-CoV-2-ORF1ab'))
    nsp12_size = len(sequence_report.projects.getReference('SARS-CoV-2-nsp12'))

    report_file = StringIO()
    sequence_report.write_nuc_header(report_file)
    sequence_report.read(aligned_reads)
    sequence_report.write_nuc_counts()

    report = report_file.getvalue()
    report_lines = report.splitlines()
    header_size = 1
    skipped_rows = 2
    # Three lines for each position in the two references, plus the header,
    # less the two skipped rows.
    expected_size = (orf1ab_size + nsp12_size) * 3 + header_size - skipped_rows
    if len(report_lines) != expected_size:
        # This always fails when it's reached, and the failure message shows
        # the whole report.
        assert (len(report_lines), report) == (expected_size, '')

    # Only compare the eleven report lines at indexes 13198 to 13208.
    key_lines = report_lines[13198:13209]
    key_report = '\n'.join(key_lines)
    assert key_report == expected_section
def main():
    """ Compare the HIV project references with fetched alignments.

    The CON_OF_CONS sequence from the 2004 consensus ENV alignment has its
    dashes removed and is upper cased, then gets added to the 2015 COM
    sequences under the name Consensus. The comparison lines are printed,
    followed by the reference names left in the set afterward.
    """
    # find_best_match_for_pssm()
    consensus_alignment = fetch_alignment_sequences(
        2004,
        'CON',  # Consensus/Ancestral
        'ENV')
    consensus = consensus_alignment['CON_OF_CONS'].replace('-', '').upper()

    project_config = ProjectConfig.loadDefault()
    ref_names = set(project_config.getAllReferences().keys())

    consensus_accession = 'Consensus'
    new_sequences = fetch_alignment_sequences('2015', 'COM')
    # Don't overwrite a fetched sequence that already uses the name.
    assert consensus_accession not in new_sequences, sorted(
        new_sequences.keys())
    new_sequences[consensus_accession] = consensus

    report_lines = compare_config('HIV',
                                  project_config,
                                  new_sequences,
                                  ref_names)
    for line in report_lines:
        print(line, end='')

    unchecked = ', '.join(sorted(ref_names))
    print('Unchecked refs: ' + unchecked)
Exemple #19
0
    def __init__(self,
                 file=None,
                 rules_yaml=None,
                 genotype=None,
                 references=None,
                 backup_genotype=None):
        """ Initialize the rule definitions, and optionally load them.

        :param file: passed to load_xml() when it is not None
        :param rules_yaml: passed to load_yaml() with both genotypes, when
            file is None and this is not
        :param genotype: passed along to load_yaml()
        :param references: {name: reference}; when None, the default project
            references are used, updated with the entries loaded from
            WILD_TYPES_PATH
        :param backup_genotype: passed along to load_yaml()
        """
        if references is None:
            references = ProjectConfig.loadDefault().getAllReferences()
            with WILD_TYPES_PATH.open() as wild_types_file:
                references.update(safe_load(wild_types_file))

        # Same references, except that the name INT is stored as IN.
        standards = {}
        for name, ref in references.items():
            standards['IN' if name == 'INT' else name] = ref
        self.stds = standards

        # Algorithm info
        self.alg_name = ''
        self.alg_version = ''

        # Definitions
        # {code: [drug_class_code]}
        self.gene_def = {}
        # {'1': 'Susceptible'}
        self.level_def = {}
        # {code: [drug_code]}
        self.drug_class = defaultdict(list)
        # [['-INF', '9', '1'], ...]: the first two are the range, and the
        # third one is the res level.
        self.global_range = []
        # {code: comment_text}
        self.comment_def = {}

        # {code: (name, [condition, [(action_type, action_value)]])}
        self.drugs = {}
        # Maybe skip for now? We don't really use this at the moment.
        self.mutation_comments = []

        if file is not None:
            self.load_xml(file)
            return
        if rules_yaml is not None:
            self.load_yaml(rules_yaml, genotype, backup_genotype)
Exemple #20
0
def fastq_g2p(pssm,
              fastq1,
              fastq2,
              g2p_csv,
              g2p_summary_csv=None,
              unmapped1=None,
              unmapped2=None,
              aligned_csv=None,
              min_count=1,
              min_valid=1,
              min_valid_percent=0.0):
    """ Run the G2P steps on a pair of FASTQ files, and write the results.

    :param pssm: passed to write_rows()
    :param fastq1: passed to FastqReader with fastq2
    :param fastq2: passed to FastqReader with fastq1
    :param g2p_csv: passed to write_rows(); if it has a name attribute, the
        read counts prefix is "read_counts" in the same folder
    :param g2p_summary_csv: passed to write_rows()
    :param unmapped1: passed to write_unmapped_reads()
    :param unmapped2: passed to write_unmapped_reads()
    :param aligned_csv: if not None, passed to write_aligned_reads()
    :param min_count: passed to write_rows()
    :param min_valid: passed to write_rows()
    :param min_valid_percent: passed to write_rows()
    """
    g2p_filename = getattr(g2p_csv, 'name', None)
    count_prefix = (None
                    if g2p_filename is None
                    else os.path.join(os.path.dirname(g2p_filename),
                                      'read_counts'))

    defaults = ProjectConfig.loadDefault()
    hiv_seed = defaults.getReference(G2P_SEED_NAME)
    coordinate_ref = defaults.getReference(COORDINATE_REF_NAME)
    v3loop_ref = extract_target(hiv_seed, coordinate_ref)

    # Each step wraps the previous one, so keep them in this order.
    merged = merge_reads(FastqReader(fastq1, fastq2))
    trimmed = trim_reads(merged, v3loop_ref)
    mapped = write_unmapped_reads(trimmed, unmapped1, unmapped2)
    counts = count_reads(mapped, count_prefix)
    if aligned_csv is not None:
        counts = write_aligned_reads(counts, aligned_csv, hiv_seed, v3loop_ref)

    write_rows(pssm,
               counts,
               g2p_csv,
               g2p_summary_csv,
               min_count,
               min_valid=min_valid,
               min_valid_percent=min_valid_percent)
Exemple #21
0
def write_nuc_mutations(nuc_csv: typing.TextIO,
                        nuc_mutations_csv: typing.TextIO):
    """ Write the nucleotide mutations found in SARS-CoV-2 coverage rows.

    Rows for any other seed are ignored. For each remaining row, every
    nucleotide other than the wild type is reported when its share of the
    coverage is at least 5%.

    :param nuc_csv: open CSV file to read, with columns seed, region,
        refseq.nuc.pos, A, C, G, T, and coverage; rows are grouped as they
        come, so they are presumably sorted by seed and region
    :param nuc_mutations_csv: open CSV file to write, with columns seed,
        region, wt, refseq_nuc_pos, var, and prevalence
    """
    nuc_rows = DictReader(nuc_csv)
    mutations_writer = DictWriter(nuc_mutations_csv,
                                  ['seed',
                                   'region',
                                   'wt',
                                   'refseq_nuc_pos',
                                   'var',
                                   'prevalence'],
                                  lineterminator=os.linesep)
    mutations_writer.writeheader()
    landmark_reader = None
    seed_seq = None
    for seed, seed_rows in groupby(nuc_rows, itemgetter('seed')):
        if seed != 'SARS-CoV-2-seed':
            continue
        if landmark_reader is None:
            # Load these once, and only when there are rows that need them.
            # The seed name is always the same here, so its reference
            # sequence can be reused for every region.
            landmark_reader = LandmarkReader.load()
            seed_seq = ProjectConfig.loadDefault().getReference(seed)
        for region_name, region_rows in groupby(seed_rows, itemgetter('region')):
            region = landmark_reader.get_gene(seed, region_name)
            # Region start is one-based, and the end is included.
            ref_seq = seed_seq[region['start']-1:region['end']]
            for row in region_rows:
                nuc_pos = int(row['refseq.nuc.pos'])
                wild_type = ref_seq[nuc_pos-1]
                coverage = int(row['coverage'])
                if coverage == 0:
                    # Nothing to report, and avoids dividing by zero.
                    continue
                for nuc in 'ACGT':
                    if nuc == wild_type:
                        continue
                    nuc_count = int(row[nuc])
                    prevalence = nuc_count / coverage
                    if prevalence >= 0.05:
                        mutations_writer.writerow(dict(seed=seed,
                                                       region=region_name,
                                                       wt=wild_type,
                                                       refseq_nuc_pos=nuc_pos,
                                                       var=nuc,
                                                       prevalence=prevalence))
Exemple #22
0
def main():
    """ Check the default project references, and print an error total.

    Each check returns its error count. Any reference names still left in the
    unchecked set after all the checks are reported, and each one counts as
    an error.
    """
    project_config = ProjectConfig.loadDefault()
    unchecked_ref_names = set(project_config.getAllReferences().keys())

    # The list items are evaluated in order, so the checks run in the order
    # they are listed.
    check_results = [
        check_hcv_seeds(project_config, unchecked_ref_names),
        check_hcv_coordinates(project_config, unchecked_ref_names),
        check_hiv_seeds(project_config, unchecked_ref_names),
        check_hiv_coordinates(project_config, unchecked_ref_names),
        check_hiv_wild_types(project_config),
        check_hla_seeds(project_config, unchecked_ref_names),
        check_hla_coordinates(project_config, unchecked_ref_names),
        check_sars_seeds(project_config, unchecked_ref_names),
        check_sars_coordinates(project_config, unchecked_ref_names),
    ]
    error_count = sum(check_results)

    if unchecked_ref_names:
        leftover_names = ', '.join(sorted(unchecked_ref_names))
        print(fill_report('Unchecked refs: ' + leftover_names))
        error_count += len(unchecked_ref_names)
    else:
        print('No unchecked refs.')
    print(f'Total errors: {error_count}.')
def load_hcv(seqs):
    """ Add the HCV primers to seqs, and report any that don't match HCV-1a.

    Forward primers (direction F) are appended to seqs['left'], and all the
    others to seqs['right']. Each primer is then compared with the section of
    the HCV-1a reference given by its h77_pos range, using the reverse
    complement for primers that aren't forward. Mismatches are printed along
    with a diff.

    :param seqs: dict holding lists under the keys left and right
    """
    primer_table = """\
protocol,name,direction,length,h77_pos,sequence
HCV WG,oligo dA20,R,20,9418-9437,AAAAAAAAAAAAAAAAAAAA
,Pr3,R,30,8616-8645,GGCGGAATTCCTGGTCATAGCCTCCGTGAA
,1abGENF1bp,F,28,266-293,GGGTCGCGAAAGGCCTTGTGGTACTGCC
,TIM-Pr3,R,30,8616-8645,CAGGAAACAGCTATGACGGCGGAATTCCTGGTCATAGCCTCCGTGAA
,1abGENF2,F,30,286-315,GTACTGCCTGATAGGGTGCTTGCGAGTGCC
,Pr6,R,30,8611-8640,AATTCCTGGTCATAGCCTCCGTGAAGACTC
HCV miDi,Pr1,F,31,8245-8275,TGGGGTTCGCGTATGATACCCGCTGCTTTGA
,Pr2,F,31,8245-8275,TGGGGTTTTCTTACGACACCAGGTGCTTTGA
,oligo dA20-TIM,R,20,9418-9437,CAGGAAACAGCTATGACAAAAAAAAAAAAAAAAAAAA
,Pr4,F,29,8253-8281,CCGTATGATACCCGCTGCTTTGACTCAAC
,Pr5,F,29,8253-8281,TCCTACGACACCAGGTGCTTTGATTCAAC
,TIM,R,,1-0,CAGGAAACAGCTATGAC
"""
    hcv_definitions = DictReader(StringIO(primer_table))
    h77 = ProjectConfig.loadDefault().getReference('HCV-1a')
    is_comparing = True
    differ = Differ()
    for row in hcv_definitions:
        name = 'HCV ' + row['name']
        start, end = map(int, row['h77_pos'].split('-'))
        primer = SeqRecord(Seq(row['sequence']), name, description='')
        complement = primer.reverse_complement(id=primer.id, description='')
        if row['direction'] == 'F':
            seqs['left'].append(primer)
            expected = primer
        else:
            seqs['right'].append(primer)
            # Compare the reverse complement with the reference instead.
            expected = complement
        # Positions are one-based, and the end is included.
        h77_section = Seq(h77[start - 1:end])
        if not is_comparing or expected.seq == h77_section:
            continue
        print(name, 'does not match.')
        diffs = differ.compare([str(expected.seq) + '\n'],
                               [str(h77_section) + '\n'])
        print(*diffs, sep='')
Exemple #24
0
 def setUp(self):
     """ Load two seed regions, and open in-memory output streams.

     Both seeds, R1-seed and R2-seed, belong to seed group "main". The remap
     counts stream starts with a header row.
     """
     projects = ProjectConfig()
     regions_json = StringIO("""\
         {
           "regions": {
             "R1-seed": {
               "seed_group": "main",
               "reference": ["ACTAAAGGG"]
             },
             "R2-seed": {
               "seed_group": "main",
               "reference": ["ACTAAAGGGAAA"]
             }
           }
         }
         """)
     projects.load(regions_json)
     self.projects = projects

     self.sam_file = StringIO()
     self.remap_counts = StringIO()
     count_columns = ['type', 'filtered_count', 'count']
     self.remap_counts_writer = DictWriter(self.remap_counts,
                                           count_columns,
                                           lineterminator=os.linesep)
     self.remap_counts_writer.writeheader()
Exemple #25
0
def fastq_g2p(pssm,
              fastq1,
              fastq2,
              g2p_csv,
              g2p_summary_csv=None,
              unmapped1=None,
              unmapped2=None,
              aligned_csv=None,
              min_count=1,
              min_valid=1,
              min_valid_percent=0.0):
    """ Run the G2P read-processing stages over a pair of FASTQ inputs.

    The reads from FastqReader are merged, trimmed against the target
    extracted from the G2P seed, passed through write_unmapped_reads(),
    counted, optionally passed through write_aligned_reads(), and finally
    handed to write_rows().

    :param pssm: passed straight through to write_rows()
    :param fastq1: first input handed to FastqReader
    :param fastq2: second input handed to FastqReader
    :param g2p_csv: output passed to write_rows(); when it has a ``name``
        attribute, the folder of that name holds the 'read_counts' prefix
        given to count_reads()
    :param g2p_summary_csv: passed to write_rows()
    :param unmapped1: passed to write_unmapped_reads()
    :param unmapped2: passed to write_unmapped_reads()
    :param aligned_csv: when not None, reads are also routed through
        write_aligned_reads() with this output
    :param min_count: passed to write_rows()
    :param min_valid: passed to write_rows()
    :param min_valid_percent: passed to write_rows()
    """
    # Without a file name there is no folder to put the read counts in.
    output_name = getattr(g2p_csv, 'name', None)
    count_prefix = (None
                    if output_name is None
                    else os.path.join(os.path.dirname(output_name),
                                      'read_counts'))

    projects = ProjectConfig.loadDefault()
    hiv_seed = projects.getReference(G2P_SEED_NAME)
    v3loop_ref = extract_target(hiv_seed,
                                projects.getReference(COORDINATE_REF_NAME))

    # Each stage wraps the output of the previous one.
    reads = merge_reads(FastqReader(fastq1, fastq2))
    reads = trim_reads(reads, v3loop_ref)
    reads = write_unmapped_reads(reads, unmapped1, unmapped2)
    read_counts = count_reads(reads, count_prefix)
    if aligned_csv is not None:
        read_counts = write_aligned_reads(read_counts,
                                          aligned_csv,
                                          hiv_seed,
                                          v3loop_ref)

    write_rows(pssm,
               read_counts,
               g2p_csv,
               g2p_summary_csv,
               min_count,
               min_valid=min_valid,
               min_valid_percent=min_valid_percent)
Exemple #26
0
    def setUp(self):
        self.defaultJsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 5,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"],
          "id": 10042
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ],
      "seed_group": "R1-seeds"
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "RWN",
        "NWR"
      ],
      "seed_group": null
    }
  }
}
""")
        self.config = ProjectConfig()
Exemple #27
0
class ProjectConfigurationTest(unittest.TestCase):
    """Checks how ProjectConfig loads JSON and answers seed/reference queries."""

    def setUp(self):
        """Create an unloaded config and the default single-project JSON."""
        self.config = ProjectConfig()
        self.defaultJsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 5,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"],
          "id": 10042
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ],
      "seed_group": "R1-seeds"
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "RWN",
        "NWR"
      ],
      "seed_group": null
    }
  }
}
""")

    def _seed_fasta(self, json_io, **write_options):
        """Load json_io into the config and return the seed FASTA text.

        Extra keyword arguments are forwarded to writeSeedFasta().
        """
        output = StringIO()
        self.config.load(json_io)
        self.config.writeSeedFasta(output, **write_options)
        return output.getvalue()

    def testConvert(self):
        """The chunks of a seed reference are joined into one FASTA record."""
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""

        actual_fasta = self._seed_fasta(self.defaultJsonIO)

        self.assertMultiLineEqual(expected_fasta, actual_fasta)

    def testSharedRegions(self):
        """A seed used by two projects is written only once."""
        shared_json = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1 and R2": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": null,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "TTT"
      ]
    }
  }
}
""")
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
>R2-seed
TTT
"""

        actual_fasta = self._seed_fasta(shared_json)

        self.assertMultiLineEqual(expected_fasta, actual_fasta)

    def testUnusedRegion(self):
        """A seed region that no project references is left out."""
        unused_json = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "TTT"
      ]
    }
  }
}
""")
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""

        actual_fasta = self._seed_fasta(unused_json)

        self.assertMultiLineEqual(expected_fasta, actual_fasta)

    def testExcludeSeeds(self):
        """Seeds named in excluded_seeds are not written."""
        three_seed_json = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R2": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R2-seed"]
        }
      ]
    },
    "R3": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R3-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "TTT"
      ]
    },
    "R3-seed": {
      "is_nucleotide": true,
      "reference": [
        "TAG"
      ]
    }
  }
}
""")
        expected_fasta = """\
>R2-seed
TTT
"""

        actual_fasta = self._seed_fasta(
            three_seed_json,
            excluded_seeds=['R1-seed', 'R3-seed'])

        self.assertMultiLineEqual(expected_fasta, actual_fasta)

    def testExcludeUnknownSeed(self):
        """Excluding a seed name that is not configured changes nothing."""
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""

        actual_fasta = self._seed_fasta(self.defaultJsonIO,
                                        excluded_seeds=['R99-seed'])

        self.assertMultiLineEqual(expected_fasta, actual_fasta)

    def testDuplicateReference(self):
        """Two seeds with the same sequence make writeSeedFasta() raise."""
        duplicate_json = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1a-seed", "R1b-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1a-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTAAAGGG"
      ]
    },
    "R1b-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTAAAGGG"
      ]
    }
  }
}
""")
        output = StringIO()
        self.config.load(duplicate_json)

        with self.assertRaisesRegex(
                RuntimeError,
                "Duplicate references: R1a-seed and R1b-seed."):
            self.config.writeSeedFasta(output)

    def testGetReference(self):
        """getReference() returns the joined sequence of a seed."""
        self.config.load(self.defaultJsonIO)

        actual_ref = self.config.getReference('R1-seed')

        self.assertSequenceEqual('ACTGAAAGGG', actual_ref)

    def testGetCoordinateReferences(self):
        """A seed maps to the joined references of its coordinate regions."""
        self.config.load(self.defaultJsonIO)

        actual_refs = self.config.getCoordinateReferences('R1-seed')

        self.assertDictEqual({'R1': 'RWNNWR'}, actual_refs)

    def testGetAllReferences(self):
        """getAllReferences() covers seed and coordinate regions alike."""
        self.config.load(self.defaultJsonIO)

        actual_refs = self.config.getAllReferences()

        self.assertEqual({'R1-seed': 'ACTGAAAGGG', 'R1': 'RWNNWR'},
                         actual_refs)

    def testUnknownReference(self):
        """Asking for a region that is not configured raises KeyError."""
        self.config.load(self.defaultJsonIO)

        with self.assertRaises(KeyError):
            self.config.getReference('R-unknown')

    def testMaxVariants(self):
        """The project's max_variants applies to its coordinate region."""
        self.config.load(self.defaultJsonIO)

        self.assertEqual(5, self.config.getMaxVariants('R1'))

    def testMaxVariantsUnusedRegion(self):
        """A coordinate region used by no project has a maximum of zero."""
        unused_json = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 2,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "NSFW"
      ]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": [
        "RSW"
      ]
    }
  }
}
""")
        self.config.load(unused_json)

        self.assertEqual(0, self.config.getMaxVariants('R2'))

    def testMaxVariantsTwoProjects(self):
        """ When two projects set a maximum for the same coordinate region,
        the larger value is the one reported.
        """
        two_project_json = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 9,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1-and-R2": {
      "max_variants": 2,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": "R2",
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "NSFW"
      ]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": [
        "RSW"
      ]
    }
  }
}
""")
        self.config.load(two_project_json)

        self.assertEqual(9, self.config.getMaxVariants('R1'))

    def testReload(self):
        """A second load() replaces the first configuration entirely."""
        first_json = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    }
  }
}
""")
        second_json = StringIO("""\
{
  "projects": {
    "R2": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  },
  "regions": {
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "GACCTA"
      ]
    }
  }
}
""")

        for json_io in (first_json, second_json):
            self.config.load(json_io)

        with self.assertRaises(KeyError):
            self.config.getReference("R1-seed")
        self.assertSequenceEqual("GACCTA", self.config.getReference("R2-seed"))

    def testProjectSeeds(self):
        """getProjectSeeds() returns the set of seed names for a project."""
        self.config.load(self.defaultJsonIO)

        actual_seeds = self.config.getProjectSeeds('R1')

        self.assertSetEqual({'R1-seed'}, actual_seeds)

    def testSeedGroup(self):
        """getSeedGroup() reports the seed_group configured for a seed."""
        self.config.load(self.defaultJsonIO)

        actual_group = self.config.getSeedGroup('R1-seed')

        self.assertEqual("R1-seeds", actual_group)
Exemple #28
0
class ProjectConfigurationProjectRegionsTest(unittest.TestCase):
    """Checks ProjectConfig.getProjectRegions() on a two-project config."""

    def setUp(self):
        """Create the JSON stream shared by the tests and an unloaded config."""
        self.defaultJsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        }
      ]
    },
    "R1 and R2": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [1, 3],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        },
        {
          "coordinate_region": "R2",
          "coordinate_region_length": 1,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R2-seed"
          ]
        }
      ]
    }
  }
}
""")
        self.config = ProjectConfig()

    def testProjectRegions(self):
        """Both projects report their settings for seed R1-seed, region R1."""
        self.config.load(self.defaultJsonIO)

        actual_regions = list(self.config.getProjectRegions('R1-seed', 'R1'))

        expected_regions = [
            dict(project_name="R1",
                 coordinate_region_length=3,
                 key_positions=[],
                 min_coverage1=10,
                 min_coverage2=50,
                 min_coverage3=100),
            dict(project_name="R1 and R2",
                 coordinate_region_length=3,
                 key_positions=[1, 3],
                 min_coverage1=10,
                 min_coverage2=50,
                 min_coverage3=100),
        ]
        self.assertEqual(expected_regions, actual_regions)

    def testProjectExcluded(self):
        """A project listed as excluded is dropped from the results."""
        self.config.load(self.defaultJsonIO)

        actual_regions = list(
            self.config.getProjectRegions('R1-seed', 'R1', ['R1']))

        expected_regions = [
            dict(project_name="R1 and R2",
                 coordinate_region_length=3,
                 key_positions=[1, 3],
                 min_coverage1=10,
                 min_coverage2=50,
                 min_coverage3=100),
        ]
        self.assertEqual(expected_regions, actual_regions)
Exemple #29
0
    def setUp(self):
        self.addTypeEqualityFunc(str, self.assertMultiLineEqual)
        config_json = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        }
      ]
    },
    "R1-and-R2": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [
            {
              "end_pos": null,
              "start_pos": 1
            },
            {
              "end_pos": null,
              "start_pos": 3
            }
          ],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        },
        {
          "coordinate_region": "R2",
          "coordinate_region_length": 1,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R2-seed"
          ]
        }
      ]
    }
  }
}
""")
        self.config = ProjectConfig()
        self.config.load(config_json)
Exemple #30
0
def aln2counts(aligned_csv,
               nuc_csv,
               amino_csv,
               coord_ins_csv,
               conseq_csv,
               failed_align_csv,
               callback=None,
               coverage_summary_csv=None,
               clipping_csv=None,
               conseq_ins_csv=None,
               g2p_aligned_csv=None,
               remap_conseq_csv=None,
               conseq_region_csv=None):
    """
    Count nucleotide and amino acid frequencies in aligned reads, and build
    consensus sequences from them.
    @param aligned_csv:         Open file handle of aligned reads (from sam2aln)
    @param nuc_csv:             Open file handle that receives nucleotide frequencies.
    @param amino_csv:           Open file handle that receives amino acid frequencies.
    @param coord_ins_csv:       Open file handle that receives insertions relative to
                                the coordinate reference.
    @param conseq_csv:          Open file handle that receives consensus sequences.
    @param failed_align_csv:    Open file handle that receives sample consensus
                                sequences that failed to align to the coordinate
                                reference.
    @param callback: progress-reporting function with three optional
        parameters - callback(message, progress, max_progress). Only used
        when aligned_csv has a name, because the file size is the maximum.
    @param coverage_summary_csv: Open file handle that receives coverage depth.
    @param clipping_csv: Open file handle of soft clipping counts
    @param conseq_ins_csv: Open file handle of insertions relative to the
        consensus sequence
    @param g2p_aligned_csv: Open file handle of aligned reads (from fastq_g2p)
    @param remap_conseq_csv: Open file handle of consensus sequences from the
        remap step.
    @param conseq_region_csv: Open file handle that receives consensus
        sequences split into regions.
    """
    # Project definitions come from the default configuration.
    projects = ProjectConfig.loadDefault()

    with InsertionWriter(coord_ins_csv) as insert_writer:
        # Set up the report and write the header of every output file.
        report = SequenceReport(insert_writer,
                                projects,
                                CONSEQ_MIXTURE_CUTOFFS)
        report.consensus_min_coverage = CONSENSUS_MIN_COVERAGE
        report.write_amino_header(amino_csv)
        report.write_consensus_header(conseq_csv)
        report.write_consensus_regions_header(conseq_region_csv)
        report.write_failure_header(failed_align_csv)
        report.write_nuc_header(nuc_csv)

        # The coverage summary is optional: no output file, no summary.
        summary_row = summary_writer = None
        if coverage_summary_csv is not None:
            summary_writer = csv.DictWriter(
                coverage_summary_csv,
                ['avg_coverage', 'coverage_region', 'region_width'],
                lineterminator=os.linesep)
            summary_writer.writeheader()
            summary_row = {}

        if callback:
            # Progress is measured against the size of the input file, so
            # an unnamed stream gets no progress reports.
            input_name = getattr(aligned_csv, 'name', None)
            if input_name:
                report.enable_callback(callback, os.stat(input_name).st_size)

        # Optional extra inputs are read before the reads are processed.
        if clipping_csv is not None:
            report.read_clipping(clipping_csv)
        if conseq_ins_csv is not None:
            report.read_insertions(conseq_ins_csv)
        if remap_conseq_csv is not None:
            report.read_remap_conseqs(remap_conseq_csv)

        report.process_reads(g2p_aligned_csv, aligned_csv, summary_row)

        # Only write a summary row if processing actually filled one in.
        if summary_writer is not None and summary_row:
            summary_writer.writerow(summary_row)
Exemple #31
0
class ProjectConfigurationTest(unittest.TestCase):
    """Checks how ProjectConfig loads JSON and answers seed/reference queries.

    NOTE: this variant uses the Python 2 ``StringIO`` module and the
    ``assertRaisesRegexp`` spelling, so it is kept compatible with that
    interpreter.
    """

    def setUp(self):
        """Create an unloaded config and the default single-project JSON."""
        self.config = ProjectConfig()
        self.defaultJsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 5,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"],
          "id": 10042
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ],
      "seed_group": "R1-seeds"
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "RWN",
        "NWR"
      ],
      "seed_group": null
    }
  }
}
""")

    def _seed_fasta(self, json_io):
        """Load json_io into the config and return the seed FASTA text."""
        output = StringIO.StringIO()
        self.config.load(json_io)
        self.config.writeSeedFasta(output)
        return output.getvalue()

    def testConvert(self):
        """The chunks of a seed reference are joined into one FASTA record."""
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""

        actual_fasta = self._seed_fasta(self.defaultJsonIO)

        self.assertMultiLineEqual(expected_fasta, actual_fasta)

    def testSharedRegions(self):
        """A seed used by two projects is written only once."""
        shared_json = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1 and R2": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": null,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "TTT"
      ]
    }
  }
}
""")
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
>R2-seed
TTT
"""

        actual_fasta = self._seed_fasta(shared_json)

        self.assertMultiLineEqual(expected_fasta, actual_fasta)

    def testUnusedRegion(self):
        """A seed region that no project references is left out."""
        unused_json = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "TTT"
      ]
    }
  }
}
""")
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""

        actual_fasta = self._seed_fasta(unused_json)

        self.assertMultiLineEqual(expected_fasta, actual_fasta)

    def testDuplicateReference(self):
        """Two seeds with the same sequence make writeSeedFasta() raise."""
        duplicate_json = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1a-seed", "R1b-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1a-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTAAAGGG"
      ]
    },
    "R1b-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTAAAGGG"
      ]
    }
  }
}
""")
        output = StringIO.StringIO()
        self.config.load(duplicate_json)

        with self.assertRaisesRegexp(
                RuntimeError,
                "Duplicate references: R1a-seed and R1b-seed."):
            self.config.writeSeedFasta(output)

    def testGetReference(self):
        """getReference() returns the joined sequence of a seed."""
        self.config.load(self.defaultJsonIO)

        actual_ref = self.config.getReference('R1-seed')

        self.assertSequenceEqual('ACTGAAAGGG', actual_ref)

    def testGetCoordinateReferences(self):
        """A seed maps to the joined references of its coordinate regions."""
        self.config.load(self.defaultJsonIO)

        actual_refs = self.config.getCoordinateReferences('R1-seed')

        self.assertDictEqual({'R1': 'RWNNWR'}, actual_refs)

    def testUnknownReference(self):
        """Asking for a region that is not configured raises KeyError."""
        self.config.load(self.defaultJsonIO)

        with self.assertRaises(KeyError):
            self.config.getReference('R-unknown')

    def testMaxVariants(self):
        """The project's max_variants applies to its coordinate region."""
        self.config.load(self.defaultJsonIO)

        self.assertEqual(5, self.config.getMaxVariants('R1'))

    def testMaxVariantsUnusedRegion(self):
        """A coordinate region used by no project has a maximum of zero."""
        unused_json = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 2,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "NSFW"
      ]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": [
        "RSW"
      ]
    }
  }
}
""")
        self.config.load(unused_json)

        self.assertEqual(0, self.config.getMaxVariants('R2'))

    def testMaxVariantsTwoProjects(self):
        """ When two projects set a maximum for the same coordinate region,
        the larger value is the one reported.
        """
        two_project_json = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 9,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1-and-R2": {
      "max_variants": 2,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": "R2",
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": [
        "NSFW"
      ]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": [
        "RSW"
      ]
    }
  }
}
""")
        self.config.load(two_project_json)

        self.assertEqual(9, self.config.getMaxVariants('R1'))

    def testReload(self):
        """A second load() replaces the first configuration entirely."""
        first_json = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R1-seed"]
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": [
        "ACTGAAA",
        "GGG"
      ]
    }
  }
}
""")
        second_json = StringIO.StringIO("""\
{
  "projects": {
    "R2": {
      "regions": [
        {
          "coordinate_region": null,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  },
  "regions": {
    "R2-seed": {
      "is_nucleotide": true,
      "reference": [
        "GACCTA"
      ]
    }
  }
}
""")

        for json_io in (first_json, second_json):
            self.config.load(json_io)

        with self.assertRaises(KeyError):
            self.config.getReference("R1-seed")
        self.assertSequenceEqual("GACCTA", self.config.getReference("R2-seed"))

    def testProjectSeeds(self):
        """getProjectSeeds() returns the set of seed names for a project."""
        self.config.load(self.defaultJsonIO)

        actual_seeds = self.config.getProjectSeeds('R1')

        self.assertSetEqual({'R1-seed'}, actual_seeds)

    def testSeedGroup(self):
        """getSeedGroup() reports the seed_group configured for a seed."""
        self.config.load(self.defaultJsonIO)

        actual_group = self.config.getSeedGroup('R1-seed')

        self.assertEqual("R1-seeds", actual_group)

    def testProjectRegions(self):
        """Both projects report their settings for seed R1-seed, region R1."""
        regions_json = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        }
      ]
    },
    "R1 and R2": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [1, 3],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        },
        {
          "coordinate_region": "R2",
          "coordinate_region_length": 1,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R2-seed"
          ]
        }
      ]
    }
  }
}
""")
        self.config.load(regions_json)

        actual_regions = list(self.config.getProjectRegions('R1-seed', 'R1'))

        expected_regions = [
            dict(project_name="R1",
                 coordinate_region_length=3,
                 key_positions=[],
                 min_coverage1=10,
                 min_coverage2=50,
                 min_coverage3=100),
            dict(project_name="R1 and R2",
                 coordinate_region_length=3,
                 key_positions=[1, 3],
                 min_coverage1=10,
                 min_coverage2=50,
                 min_coverage3=100),
        ]
        self.assertEqual(expected_regions, actual_regions)
Exemple #32
0
def main():
    """Publish the default project configuration to QAI as a new pipeline.

    Steps, in order: stop if the requested pipeline version already exists;
    create every configured region that QAI does not have yet (adding its
    seed group when needed), and report - or, with args.update_sequences,
    overwrite - references that differ; create the pipeline; then create a
    project version, its project regions and their key positions for every
    configured project. Scoring values are read from project_scoring.json.

    Raises:
        RuntimeError: if the pipeline version is already registered.
    """
    args = parse_args()
    project_config = ProjectConfig.loadDefault()
    scoring_path = Path(__file__).parent.parent / 'project_scoring.json'
    with scoring_path.open() as scoring_file:
        scoring_config = json.load(scoring_file)

    with qai_helper.Session() as session:
        session.login(args.qai_server, args.qai_user, args.qai_password)

        existing_pipelines = session.get_json(
            "/lab_miseq_pipelines?version=" + args.pipeline_version,
            retries=0)
        if existing_pipelines:
            raise RuntimeError(
                f'Pipeline {args.pipeline_version} already exists.')

        seed_group_ids = {
            group['name']: group['id']
            for group in session.get_json("/lab_miseq_seed_groups")}
        regions = {
            region['name']: region
            for region in session.get_json("/lab_miseq_regions", retries=0)}

        configured_regions = project_config.config['regions']
        for region_name, region_data in configured_regions.items():
            ref_seq = ''.join(region_data['reference'])
            region = regions.get(region_name)
            if region is not None:
                # Known region: only its reference sequence can be out of date.
                if region['reference'] != ref_seq:
                    print("Reference doesn't match:", region_name)
                    if args.update_sequences:
                        region['reference'] = ref_seq
                        session.post_json(
                            f"/lab_miseq_regions/{region['id']}",
                            region)
                continue

            # New region: make sure its seed group exists before posting it.
            seed_group_name = region_data['seed_group']
            seed_group_id = seed_group_ids.get(seed_group_name)
            if seed_group_name and seed_group_id is None:
                new_group = session.post_json("/lab_miseq_seed_groups",
                                              {'name': seed_group_name})
                seed_group_id = new_group['id']
                seed_group_ids[seed_group_name] = seed_group_id
            regions[region_name] = session.post_json(
                "/lab_miseq_regions",
                {'name': region_name,
                 'is_nucleotide': region_data['is_nucleotide'],
                 'reference': ref_seq,
                 'seed_group_id': seed_group_id})

        pipeline = session.post_json("/lab_miseq_pipelines",
                                     {'version': args.pipeline_version})
        pipeline_id = pipeline['id']

        projects = {
            project['name']: project
            for project in session.get_json("/lab_miseq_projects", retries=0)}

        configured_projects = project_config.config['projects']
        for project_name, project_data in configured_projects.items():
            project = projects.get(project_name)
            if project is None:
                project = session.post_json(
                    "/lab_miseq_projects",
                    {'name': project_name,
                     'max_variants': project_data['max_variants']})
            project_version = session.post_json(
                "/lab_miseq_project_versions",
                {'pipeline_id': pipeline_id,
                 'project_id': project['id']})

            for index, region_data in enumerate(project_data['regions']):
                # Scoring entries are matched to project regions by position.
                project_scoring = scoring_config['projects'][project_name]
                scoring_data = project_scoring['regions'][index]
                coordinate_region = regions[region_data['coordinate_region']]
                # The first seed region supplies the seed group.
                seed_region = regions[region_data['seed_region_names'][0]]
                seed_group_id = seed_region['seed_group_id']
                project_region = session.post_json(
                    "/lab_miseq_project_regions",
                    {'project_version_id': project_version['id'],
                     'coordinate_region_id': coordinate_region['id'],
                     'min_coverage1': scoring_data['min_coverage1'],
                     'min_coverage2': scoring_data['min_coverage2'],
                     'min_coverage3': scoring_data['min_coverage3'],
                     'seed_group_id': seed_group_id})

                for key_position in scoring_data['key_positions']:
                    session.post_json(
                        "/lab_miseq_key_positions",
                        {'project_region_id': project_region['id'],
                         'start_pos': key_position['start_pos'],
                         'end_pos': key_position['end_pos']})

    print("Done.")
Exemple #33
0
from micall.utils.alignment_wrapper import align_nucs

try:
    # noinspection PyPackageRequirements
    from mappy import Aligner
except ImportError:
    Aligner = None

from micall.utils.fetch_sequences import fetch_by_accession


import sys
from micall.core.project_config import ProjectConfig

# Reference for the 'SARS-CoV-2-seed' region, looked up once in the default
# project configuration when the module is imported.
REFERENCE = ProjectConfig.loadDefault().getReference('SARS-CoV-2-seed')

def load_coverage(csv):
    """Read per-position coverage counts from a CSV file.

    :param csv: path of a CSV file that has ``query_nuc_pos`` and
        ``coverage`` columns, both holding integers.
    :return: dict mapping each ``query_nuc_pos`` (int) to its ``coverage``
        (int). If a position appears on several rows, the last row wins.
    :raises KeyError: if either column is missing.
    :raises ValueError: if a value in either column is not an integer.
    """
    # newline='' hands newline handling to the csv module, as its
    # documentation asks for any file object passed to a reader.
    with open(csv, newline='') as csvfile:
        return {int(row['query_nuc_pos']): int(row['coverage'])
                for row in DictReader(csvfile)}

BATCH = 'batch_01'

ROOT = (
    Path('/wow')
    / BATCH
Exemple #34
0
class CoveragePlotsTest(TestCase):
    # Exercises coverage_plot() against a small scoring configuration that
    # defines two projects: "R1" and "R1-and-R2".

    def setUp(self):
        # Show readable multi-line diffs when string comparisons fail.
        self.addTypeEqualityFunc(str, self.assertMultiLineEqual)
        self.config = ProjectConfig()
        self.config.load(StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        }
      ]
    },
    "R1-and-R2": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [
            {
              "end_pos": null,
              "start_pos": 1
            },
            {
              "end_pos": null,
              "start_pos": 3
            }
          ],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        },
        {
          "coordinate_region": "R2",
          "coordinate_region_length": 1,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R2-seed"
          ]
        }
      ]
    }
  }
}
"""))

    @patch('matplotlib.pyplot.savefig')
    @patch('micall.core.project_config.ProjectConfig.loadScoring')
    def test_simple(self, config_mock, savefig_mock):
        # Decorators apply bottom-up, so the loadScoring mock arrives first.
        config_mock.return_value = self.config

        # Expectations: one plot per project for region R1, and one score
        # row per project.
        plot_calls = [call('E1234.R1.R1.png'),
                      call('E1234.R1-and-R2.R1.png')]
        score_rows = """\
project,region,seed,q.cut,min.coverage,which.key.pos,off.score,on.score
R1,R1,R1-seed,15,5,1,-1,1
R1-and-R2,R1,R1-seed,15,5,1,-1,1
"""

        # Input: three amino positions of R1, each with a count of five.
        amino_counts = StringIO("""\
seed,region,q-cutoff,query.aa.pos,refseq.aa.pos,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*
R1-seed,R1,15,100,1,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
R1-seed,R1,15,101,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0
R1-seed,R1,15,102,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0
""")
        amino_counts.name = 'E1234.amino.csv'
        score_output = StringIO()

        coverage_plot(amino_counts, coverage_scores_csv=score_output)

        self.assertEqual(plot_calls, savefig_mock.mock_calls)
        self.assertEqual(score_rows, score_output.getvalue())
Exemple #35
0
def main():
    """Write simulated FASTQ files for the 2130 and 2140 samples.

    Each FastqFile entry names an output file, the extract number that is
    embedded in the read headers, whether the reads are reverse-complemented
    (R2 files), the coordinate sections to copy from the reference, and the
    codon mutations to apply. Files are created relative to the current
    working directory, and every read in a section is identical.
    """
    fastq_files = [FastqFile('2130A-HCV_S15_L001_R1_001.fastq',
                             '2130',
                             False,
                             (FastqSection('HCV2-JFH-1-NS5b', 1, 60, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 117, 176, 100)),
                             (CodonMutation(159, 'GTC'),)),
                   FastqFile('2130A-HCV_S15_L001_R2_001.fastq',
                             '2130',
                             True,
                             (FastqSection('HCV2-JFH-1-NS5b', 57, 116, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 171, 230, 100)),
                             (CodonMutation(159, 'GTC'),)),
                   FastqFile('2130AMIDI-MidHCV_S16_L001_R1_001.fastq',
                             '2130',
                             False,
                             (FastqSection('HCV2-JFH-1-NS5b', 231, 313, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 396, 478, 100)),
                             (CodonMutation(316, 'AGC'),)),
                   FastqFile('2130AMIDI-MidHCV_S16_L001_R2_001.fastq',
                             '2130',
                             True,
                             (FastqSection('HCV2-JFH-1-NS5b', 313, 395, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 479, 561, 100)),
                             (CodonMutation(316, 'AGC'),)),
                   FastqFile('2140A-HIV_S17_L001_R1_001.fastq',
                             '2140',
                             False,
                             (FastqSection('PR', 1, 80, 100),),
                             (CodonMutation(24, 'ATA'),)),
                   FastqFile('2140A-HIV_S17_L001_R2_001.fastq',
                             '2140',
                             True,
                             (FastqSection('PR', 20, 99, 100),),
                             (CodonMutation(24, 'ATA'),))]
    projects = ProjectConfig.loadDefault()
    for fastq_file in fastq_files:
        # Read 1 or read 2 marker in the header; constant for the whole file.
        file_num = '2' if fastq_file.is_reversed else '1'
        with open(fastq_file.name, 'w') as f:
            # Cluster numbers keep counting across sections so that read
            # names stay unique within a file.
            next_cluster = 1
            for section in fastq_file.sections:
                ref_name, ref_start, ref_end = find_coord_pos(projects,
                                                              section.coord_name,
                                                              section.start_pos,
                                                              section.end_pos)

                ref_nuc_seq = projects.getReference(ref_name)
                ref_nuc_section = list(ref_nuc_seq[ref_start:ref_end])
                for mutation in fastq_file.mutations:
                    if section.start_pos <= mutation.pos <= section.end_pos:
                        # Section positions count codons, so scale the
                        # offset to nucleotides.
                        section_pos = (mutation.pos - section.start_pos) * 3
                        ref_nuc_section[section_pos:section_pos+3] = list(mutation.codon)
                ref_nuc_section = ''.join(ref_nuc_section)
                if fastq_file.is_reversed:
                    ref_nuc_section = reverse_and_complement(ref_nuc_section)
                # FASTQ needs one quality character per base. Measure the
                # read that is actually written instead of trusting
                # ref_end - ref_start, which is wrong if the slice was
                # clipped or a mutation changed the length.
                phred_scores = 'A' * len(ref_nuc_section)
                for cluster in range(section.count):
                    f.write('@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1\n'.format(
                        fastq_file.extract_num,
                        cluster + next_cluster,
                        file_num))
                    f.write(ref_nuc_section+'\n')
                    f.write('+\n')
                    f.write(phred_scores+'\n')
                next_cluster += section.count
Exemple #36
0
def aln2counts(aligned_csv,
               nuc_csv,
               amino_csv,
               coord_ins_csv,
               conseq_csv,
               failed_align_csv,
               callback=None,
               coverage_summary_csv=None,
               clipping_csv=None,
               conseq_ins_csv=None,
               g2p_aligned_csv=None,
               remap_conseq_csv=None):
    """
    Count nucleotide and amino acid frequencies in aligned reads, and build
    consensus sequences.

    @param aligned_csv: open file with aligned reads (from sam2aln)
    @param nuc_csv: open file that receives nucleotide frequencies
    @param amino_csv: open file that receives amino acid frequencies
    @param coord_ins_csv: open file that receives insertions relative to the
        coordinate reference
    @param conseq_csv: open file that receives consensus sequences
    @param failed_align_csv: open file that receives sample consensus
        sequences that failed to align to the coordinate reference
    @param callback: optional progress reporter, called as
        callback(message, progress, max_progress); it is only enabled when
        aligned_csv has a file name
    @param coverage_summary_csv: optional open file that receives a coverage
        depth summary
    @param clipping_csv: optional open file with soft clipping counts
    @param conseq_ins_csv: optional open file with insertions relative to
        the consensus sequence
    @param g2p_aligned_csv: optional open file with aligned reads (from
        fastq_g2p)
    @param remap_conseq_csv: optional open file with consensus sequences
        from the remap step
    """
    projects = ProjectConfig.loadDefault()

    # Set up the reporter and write the header row of each output file.
    report = SequenceReport(InsertionWriter(coord_ins_csv),
                            projects,
                            CONSEQ_MIXTURE_CUTOFFS)
    report.consensus_min_coverage = CONSENSUS_MIN_COVERAGE
    for write_header, output_csv in ((report.write_amino_header, amino_csv),
                                     (report.write_consensus_header, conseq_csv),
                                     (report.write_failure_header, failed_align_csv),
                                     (report.write_nuc_header, nuc_csv)):
        write_header(output_csv)

    # The coverage summary is optional; without a file there is no writer
    # and nothing to collect.
    coverage_summary = coverage_writer = None
    if coverage_summary_csv is not None:
        coverage_writer = csv.DictWriter(
            coverage_summary_csv,
            ['avg_coverage', 'coverage_region', 'region_width'],
            lineterminator=os.linesep)
        coverage_writer.writeheader()
        coverage_summary = {}

    # Progress is reported against the size of the input file, so it needs
    # a real file name.
    aligned_filename = getattr(aligned_csv, 'name', None) if callback else None
    if aligned_filename:
        report.enable_callback(callback, os.stat(aligned_filename).st_size)

    # Feed in whichever optional inputs were supplied.
    for optional_csv, read_input in ((clipping_csv, report.read_clipping),
                                     (conseq_ins_csv, report.read_insertions),
                                     (remap_conseq_csv, report.read_remap_conseqs)):
        if optional_csv is not None:
            read_input(optional_csv)

    report.process_reads(g2p_aligned_csv, aligned_csv, coverage_summary)

    # Only write a summary row if processing collected something.
    if coverage_writer is not None and coverage_summary:
        coverage_writer.writerow(coverage_summary)
Exemple #37
0
def build_conseqs(conseqs_file, run, sample_sheet, ok_sample_regions):
    """
    Turn a pipeline conseq file into the JSON objects that are sent to QAI.

    @param conseqs_file: an open file with the consensus sequences for all
        samples in the run; its columns are sample, region, q-cutoff,
        s-number, consensus-percent-cutoff, and sequence
    @param run: a hash with the attributes of the run record; its
        'sequencing_summary' entry lists each sample's tag and target project
    @param sample_sheet: the data parsed from the sample sheet; its "Data"
        entry is looked up by the "sample" column of the conseq file
    @param ok_sample_regions: a set of (sample_name, region, qcut) tuples
        that were given a good score by the pipeline
    @return an array of JSON hashes, one for each conseq
    """
    sequencing_entries = run['sequencing_summary']
    conseq_rows = csv.DictReader(conseqs_file)

    # Collect every (tags, seed_name) pair that some sequencing targets.
    projects = ProjectConfig.loadDefault()
    target_regions = set()
    for sequencing in sequencing_entries:
        try:
            seed_names = projects.getProjectSeeds(sequencing['target_project'])
        except KeyError:
            logger.warning('Failed to load project seeds.', exc_info=True)
            continue
        for seed_name in seed_names:
            target_regions.add((sequencing['tag'], seed_name))

    conseqs = []
    for row in conseq_rows:
        # The "sample" column is the key into the sample sheet's data, which
        # holds the original sample name and its tags.
        fastq_filename = row["sample"]
        sample_info = sample_sheet["Data"][fastq_filename]
        orig_sample_name = sample_info["orig_sample_name"]
        sample_tags = sample_info["tags"]

        # FIXME a blank sequence is replaced with a dash until the
        # receiving side accepts blank sequences.
        sequence = row["sequence"]
        if len(sequence) == 0:
            sequence = "-"

        # A conseq is released only if it scored well AND the sample's
        # project targets that region.
        scored_well = (fastq_filename,
                       row["region"],
                       row["q-cutoff"]) in ok_sample_regions
        is_targeted = (sample_tags, row["region"]) in target_regions
        conseq = {
            "samplename": orig_sample_name,
            # July 9, 2014: the test code can't be filled in properly until
            # a lookup table for it has been defined.
            "testcode": None,
            "conseq_cutoff": row["consensus-percent-cutoff"],
            "region": row["region"],
            "qcutoff": float(row["q-cutoff"]),
            "snum": fastq_filename.rsplit('_', 1)[-1],
            "seq": sequence,
            "ok_for_release": scored_well and is_targeted
        }
        conseqs.append(conseq)
    return conseqs
Exemple #38
0
def main():
    """Register a new pipeline version and its project definitions in QAI.

    Reads the default project configuration plus '../project_scoring.json',
    then, over one QAI session:

    * refuses to continue if settings.pipeline_version already exists,
    * creates any regions (and their seed groups) that QAI does not have,
    * creates the pipeline record,
    * creates any missing projects, and for every project a new project
      version with its project regions and key positions.

    Raises RuntimeError when the pipeline version is already registered.
    """
    project_config = ProjectConfig.loadDefault()
    # Mode 'rU' was removed in Python 3.11; universal newlines are already
    # the default for text mode.
    with open('../project_scoring.json', 'r') as scoring_file:
        scoring_config = json.load(scoring_file)
    with qai_helper.Session() as session:
        session.login(settings.qai_path, settings.qai_user,
                      settings.qai_password)

        pipelines = session.get_json("/lab_miseq_pipelines?version=" +
                                     settings.pipeline_version,
                                     retries=0)
        if pipelines:
            raise RuntimeError('Pipeline {} already exists.'.format(
                settings.pipeline_version))

        # Index the existing seed groups and regions by name, so only the
        # missing ones get created.
        seed_groups = session.get_json("/lab_miseq_seed_groups")
        seed_group_ids = dict(map(itemgetter('name', 'id'), seed_groups))
        old_regions = session.get_json("/lab_miseq_regions", retries=0)
        regions = {region['name']: region for region in old_regions}
        for region_name, region_data in project_config.config[
                'regions'].items():
            region = regions.get(region_name)
            if region is None:
                seed_group_name = region_data['seed_group']
                seed_group_id = seed_group_ids.get(seed_group_name)
                if seed_group_id is None and seed_group_name:
                    seed_group = session.post_json("/lab_miseq_seed_groups",
                                                   {'name': seed_group_name})
                    seed_group_id = seed_group['id']
                    seed_group_ids[seed_group_name] = seed_group_id
                region = session.post_json(
                    "/lab_miseq_regions", {
                        'name': region_name,
                        'is_nucleotide': region_data['is_nucleotide'],
                        'reference': ''.join(region_data['reference']),
                        'seed_group_id': seed_group_id
                    })
                regions[region_name] = region

        pipeline = session.post_json("/lab_miseq_pipelines",
                                     {'version': settings.pipeline_version})
        pipeline_id = pipeline['id']

        old_projects = session.get_json("/lab_miseq_projects", retries=0)
        projects = {project['name']: project for project in old_projects}
        for project_name, project_data in project_config.config[
                'projects'].items():
            project = projects.get(project_name)
            if project is None:
                project = session.post_json(
                    "/lab_miseq_projects", {
                        'name': project_name,
                        'max_variants': project_data['max_variants']
                    })
            # Every project gets a fresh version tied to the new pipeline.
            project_version = session.post_json("/lab_miseq_project_versions",
                                                {
                                                    'pipeline_id': pipeline_id,
                                                    'project_id': project['id']
                                                })
            for i, region_data in enumerate(project_data['regions']):
                # The scoring file lists regions in the same order as the
                # project configuration, so they are matched up by index.
                scoring_data = scoring_config['projects'][project_name][
                    'regions'][i]
                coordinate_region = regions[region_data['coordinate_region']]
                seed_region = regions[region_data['seed_region_names'][0]]
                seed_group_id = seed_region['seed_group_id']
                project_region = session.post_json(
                    "/lab_miseq_project_regions", {
                        'project_version_id': project_version['id'],
                        'coordinate_region_id': coordinate_region['id'],
                        'min_coverage1': scoring_data['min_coverage1'],
                        'min_coverage2': scoring_data['min_coverage2'],
                        'min_coverage3': scoring_data['min_coverage3'],
                        'seed_group_id': seed_group_id
                    })

                for key_position in scoring_data['key_positions']:
                    session.post_json(
                        "/lab_miseq_key_positions", {
                            'project_region_id': project_region['id'],
                            'start_pos': key_position['start_pos'],
                            'end_pos': key_position['end_pos']
                        })

    print("Done.")
Exemple #39
0
def main():
    """Write the simulated FASTQ files described by the table below.

    Loads the default project configuration, prepares the sections for each
    sample (some fixed, some built by make_random_sections), and then writes
    one FASTQ file per FastqFile entry, relative to the current working
    directory. Within a section, every read is the same slice of the
    reference with that section's mutations applied; files flagged as
    reversed get the reverse complement.
    """
    projects = ProjectConfig.loadDefault()
    # NOTE(review): make_random_sections presumably draws random numbers, so
    # the order of these calls may affect the generated files - confirm
    # before reordering.
    sections_2100hcv_1, sections_2100hcv_2 = make_random_sections(
        'HCV1A-H77-NS5a', 1, 300, projects, 400)
    sections_2100v3_1, sections_2100v3_2 = ([
        FastqSection('HIV1-B-FR-K03455-seed', 7056, 7312, 50),
        FastqSection('HIV1-B-FR-K03455-seed', 7062, 7312, 50)
    ], [
        FastqSection('HIV1-B-FR-K03455-seed', 7123, 7373, 50),
        FastqSection('HIV1-B-FR-K03455-seed', 7123, 7376, 50)
    ])
    sections_2100hiv_1, sections_2100hiv_2 = make_random_sections(
        'RT', 1, 300, projects, 400)
    sections_2160_1, sections_2160_2 = make_random_sections(
        'HCV2-JFH-1-NS5b',
        1,
        230,
        projects,
        mutations=(CodonMutation(159, 'GTC'), ))
    sections_2160midi_1, sections_2160midi_2 = make_random_sections(
        'HCV2-JFH-1-NS5b',
        231,
        561,
        projects,
        mutations=(CodonMutation(316, 'AGC'), ))
    sections_2170_1a_1, sections_2170_1a_2 = make_random_sections(
        'HCV-1a', 6258, 9375)
    sections_2170_2_1, sections_2170_2_2 = make_random_sections(
        'HCV-2a', 6269, 9440)
    sections_2180_1, sections_2180_2 = make_random_sections(
        'HIV1-B-FR-K03455-seed', 6225, 7757)
    hxb2_ref = projects.getReference('HIV1-B-FR-K03455-seed')

    # Add a synthetic region to the loaded configuration: two slices of the
    # HXB2 reference joined together, leaving out everything between them.
    projects.config['regions']['HXB2-with-deletion'] = dict(
        reference=hxb2_ref[617:928] + hxb2_ref[9358:9652],
        is_nucleotide=True,
        seed_group=None)
    sections_2210_1, sections_2210_2 = make_random_sections(
        'HXB2-with-deletion', projects=projects)
    # Each entry: file name, extract number (used in the read headers),
    # whether the reads are reversed, and the sections to write.
    fastq_files = [
        FastqFile('2010A-V3LOOP_S3_L001_R1_001.fastq', '2010', False,
                  (FastqSection('HIV1-CON-XX-Consensus-seed', 855, 906, 10),
                   FastqSection('HIV1-CON-XX-Consensus-seed', 912, 960, 10))),
        FastqFile('2010A-V3LOOP_S3_L001_R2_001.fastq', '2010', True,
                  (FastqSection('HIV1-CON-XX-Consensus-seed', 855, 906, 10),
                   FastqSection('HIV1-CON-XX-Consensus-seed', 912, 960, 10))),
        FastqFile('2020A-GP41_S4_L001_R1_001.fastq', '2020', False,
                  (FastqSection('HIV1-B-FR-KF716496-seed', 6957, 7065, 10,
                                (CodonMutation(6981, 'GGGATA'), )), )),
        FastqFile('2020A-GP41_S4_L001_R2_001.fastq', '2020', True,
                  (FastqSection('HIV1-B-FR-KF716496-seed', 6957, 7065, 10,
                                (CodonMutation(6981, 'GGGATA'), )), )),
        FastqFile('2040A-HLA-B_S6_L001_R1_001.fastq', '2040', False,
                  (FastqSection('HLA-B-seed', 201, 315, 80),
                   FastqSection('HLA-B-seed', 201, 315, 20,
                                (CodonMutation(207, 'TCT'), )))),
        FastqFile('2040A-HLA-B_S6_L001_R2_001.fastq', '2040', True,
                  (FastqSection('HLA-B-seed', 201, 315, 80),
                   FastqSection('HLA-B-seed', 201, 315, 20,
                                (CodonMutation(207, 'TCT'), )))),
        FastqFile(
            '2070A-PR_S9_L001_R1_001.fastq', '2070', False,
            (FastqSection('PR', 40, 80, 12, (CodonMutation(45, ''), )),
             FastqSection('PR', 40, 80, 3,
                          (CodonMutation(45, ''), CodonMutation(64, ''))))),
        FastqFile(
            '2070A-PR_S9_L001_R2_001.fastq', '2070', True,
            (FastqSection('PR', 40, 80, 12, (CodonMutation(45, ''), )),
             FastqSection('PR', 40, 80, 3,
                          (CodonMutation(45, ''), CodonMutation(64, ''))))),
        FastqFile('2100A-HCV-1337B-V3LOOP-PWND-HIV_S12_L001_R1_001.fastq',
                  '2100', False,
                  sections_2100hcv_1 + sections_2100v3_1 + sections_2100hiv_1),
        FastqFile('2100A-HCV-1337B-V3LOOP-PWND-HIV_S12_L001_R2_001.fastq',
                  '2100', True,
                  sections_2100hcv_2 + sections_2100v3_2 + sections_2100hiv_2),
        FastqFile('2130A-HCV_S15_L001_R1_001.fastq', '2130', False,
                  (FastqSection('HCV2-JFH-1-NS5b', 1, 66, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 115, 181, 100,
                                (CodonMutation(159, 'GTC'), )))),
        FastqFile('2130A-HCV_S15_L001_R2_001.fastq', '2130', True,
                  (FastqSection('HCV2-JFH-1-NS5b', 51, 114, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 165, 230, 100))),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R1_001.fastq', '2130', False,
                  (FastqSection('HCV2-JFH-1-NS5b', 231, 315, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 398, 485, 100))),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R2_001.fastq', '2130', True,
                  (FastqSection('HCV2-JFH-1-NS5b', 305, 397, 100,
                                (CodonMutation(316, 'AGC'), )),
                   FastqSection('HCV2-JFH-1-NS5b', 470, 561, 100))),
        FastqFile('2140A-HIV_S17_L001_R1_001.fastq', '2140', False,
                  (FastqSection('PR', 1, 80, 100,
                                (CodonMutation(24, 'ATA'), )), )),
        FastqFile('2140A-HIV_S17_L001_R2_001.fastq', '2140', True,
                  (FastqSection('PR', 20, 99, 100,
                                (CodonMutation(24, 'ATA'), )), )),
        # Simplify with one_contig.
        FastqFile('2160A-HCV_S19_L001_R1_001.fastq', '2160', False,
                  sections_2160_1),
        FastqFile('2160A-HCV_S19_L001_R2_001.fastq', '2160', True,
                  sections_2160_2),
        # Simplify with one_contig.
        FastqFile('2160AMIDI-MidHCV_S20_L001_R1_001.fastq', '2160', False,
                  sections_2160midi_1),
        FastqFile('2160AMIDI-MidHCV_S20_L001_R2_001.fastq', '2160', True,
                  sections_2160midi_2),
        # Simplify with two_long_contigs.
        FastqFile('2170A-HCV_S21_L001_R1_001.fastq', '2170', False,
                  sections_2170_1a_1 + sections_2170_2_1),
        FastqFile('2170A-HCV_S21_L001_R2_001.fastq', '2170', True,
                  sections_2170_1a_2 + sections_2170_2_2),
        FastqFile('2180A-HIV_S22_L001_R1_001.fastq', '2180', False,
                  sections_2180_1),
        FastqFile('2180A-HIV_S22_L001_R2_001.fastq', '2180', True,
                  sections_2180_2),
        FastqFile('2190A-SARSCOV2_S23_L001_R1_001.fastq', '2190', False,
                  (FastqSection('SARS-CoV-2-ORF1ab', 4393, 4429, 50,
                                (CodonMutation(4400, 'TCA'), )),
                   FastqSection('SARS-CoV-2-ORF1ab', 4393, 4430, 50,
                                (CodonMutation(4400, 'TCA'), )))),
        FastqFile('2190A-SARSCOV2_S23_L001_R2_001.fastq', '2190', True,
                  (FastqSection('SARS-CoV-2-ORF1ab', 4393, 4429, 50,
                                (CodonMutation(4400, 'TCA'), )),
                   FastqSection('SARS-CoV-2-ORF1ab', 4393, 4430, 50,
                                (CodonMutation(4400, 'TCA'), )))),
        FastqFile('2200A-SARSCOV2_S24_L001_R1_001.fastq', '2200', False,
                  (FastqSection('SARS-CoV-2-nsp1', 20, 66, 100), )),
        FastqFile('2200A-SARSCOV2_S24_L001_R2_001.fastq', '2200', True,
                  (FastqSection('SARS-CoV-2-nsp1', 56, 102, 100), )),
        FastqFile('2210A-NFLHIVDNA_S25_L001_R1_001.fastq', '2210', False,
                  sections_2210_1),
        FastqFile('2210A-NFLHIVDNA_S25_L001_R2_001.fastq', '2210', True,
                  sections_2210_2)
    ]
    for fastq_file in fastq_files:
        with open(fastq_file.name, 'w') as f:
            # Cluster numbers keep counting across sections so that read
            # names stay unique within a file.
            next_cluster = 1
            for section in fastq_file.sections:
                ref_name, ref_start, ref_end = find_coord_pos(
                    projects, section.coord_name, section.start_pos,
                    section.end_pos)

                ref_nuc_seq = projects.getReference(ref_name)
                ref_nuc_section = list(ref_nuc_seq[ref_start:ref_end])
                # If find_coord_pos handed back the section's own positions,
                # the section is taken to be in nucleotide coordinates;
                # otherwise mutation offsets are scaled from codons below.
                is_nucleotide = ((ref_start, ref_end) == (section.start_pos,
                                                          section.end_pos))
                for mutation in section.mutations:
                    if section.start_pos <= mutation.pos <= section.end_pos:
                        section_pos = mutation.pos - section.start_pos
                        if not is_nucleotide:
                            section_pos *= 3
                        # Replaces three nucleotides with the mutation text,
                        # so '' deletes them and a longer string inserts.
                        # NOTE(review): offsets are not adjusted after a
                        # length-changing mutation, so a later mutation in
                        # the same section lands on shifted positions -
                        # confirm that is intended.
                        ref_nuc_section[section_pos:section_pos + 3] = list(
                            mutation.codon)
                ref_nuc_section = ''.join(ref_nuc_section)
                if fastq_file.is_reversed:
                    ref_nuc_section = reverse_and_complement(ref_nuc_section)
                # One quality character per base of the final read.
                phred_scores = 'A' * len(ref_nuc_section)
                file_num = '2' if fastq_file.is_reversed else '1'
                # noinspection PyTypeChecker
                for cluster in range(section.count):
                    f.write(
                        '@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1\n'
                        .format(fastq_file.extract_num, cluster + next_cluster,
                                file_num))
                    f.write(ref_nuc_section + '\n')
                    f.write('+\n')
                    f.write(phred_scores + '\n')
                next_cluster += section.count
Exemple #40
0
class ConvertPrelimTest(unittest.TestCase):
    def setUp(self):
        # Output buffers that the tests hand to convert_prelim(); the remap
        # counts file starts out with just its header row.
        self.sam_file = StringIO()
        self.remap_counts = StringIO()
        self.remap_counts_writer = DictWriter(
            self.remap_counts,
            fieldnames=['type', 'filtered_count', 'count'],
            lineterminator=os.linesep)
        self.remap_counts_writer.writeheader()

        # Two seed references in the same seed group: 9 and 12 bases long.
        config_json = StringIO("""\
            {
              "regions": {
                "R1-seed": {
                  "seed_group": "main",
                  "reference": ["ACTAAAGGG"]
                },
                "R2-seed": {
                  "seed_group": "main",
                  "reference": ["ACTAAAGGGAAA"]
                }
              }
            }
            """)
        self.projects = ProjectConfig()
        self.projects.load(config_json)

    def test_simple(self):
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,9M,=,1,0,AAACCCTTT,BBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD	VN:1.0	SO:unsorted
@SQ	SN:R1-seed	LN:9
@SQ	SN:R2-seed	LN:12
@PG	ID:bowtie2	PN:bowtie2	VN:2.2.3	CL:""
example1\t89\tR1-seed\t1\t0\t9M\t=\t1\t0\tAAACCCTTT\tBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim R1-seed,0,1
"""
        expected_seed_counts = {}

        seed_counts = convert_prelim(prelim_csv,
                                     self.sam_file,
                                     self.remap_counts_writer,
                                     count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)

    def test_two_regions(self):
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,9M,=,1,0,AAACCCTTT,BBBBBBBBB
example2,89,R2-seed,1,0,9M,=,1,0,AAAACCTTT,BBBBBBBBB
example3,89,R2-seed,1,0,9M,=,1,0,AAAAACTTT,BBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD	VN:1.0	SO:unsorted
@SQ	SN:R1-seed	LN:9
@SQ	SN:R2-seed	LN:12
@PG	ID:bowtie2	PN:bowtie2	VN:2.2.3	CL:""
example1\t89\tR1-seed\t1\t0\t9M\t=\t1\t0\tAAACCCTTT\tBBBBBBBBB
example2\t89\tR2-seed\t1\t0\t9M\t=\t1\t0\tAAAACCTTT\tBBBBBBBBB
example3\t89\tR2-seed\t1\t0\t9M\t=\t1\t0\tAAAAACTTT\tBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim R1-seed,0,1
prelim R2-seed,0,2
"""
        expected_seed_counts = {}

        seed_counts = convert_prelim(prelim_csv,
                                     self.sam_file,
                                     self.remap_counts_writer,
                                     count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)

    def test_long_reads(self):
        self.maxDiff = None
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,54M,=,1,0,\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2,89,R1-seed,1,0,54M,=,1,0,\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD	VN:1.0	SO:unsorted
@SQ	SN:R1-seed	LN:9
@SQ	SN:R2-seed	LN:12
@PG	ID:bowtie2	PN:bowtie2	VN:2.2.3	CL:""
example1\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim R1-seed,2,2
"""
        expected_seed_counts = {'R1-seed': 2}

        seed_counts = convert_prelim(prelim_csv,
                                     self.sam_file,
                                     self.remap_counts_writer,
                                     count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)

    def test_star_region(self):
        self.maxDiff = None
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,54M,=,1,0,\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2,89,R1-seed,1,0,54M,=,1,0,\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example3,93,*,*,*,*,*,*,*,\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD	VN:1.0	SO:unsorted
@SQ	SN:R1-seed	LN:9
@SQ	SN:R2-seed	LN:12
@PG	ID:bowtie2	PN:bowtie2	VN:2.2.3	CL:""
example1\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example3\t93\t*\t*\t*\t*\t*\t*\t*\t\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim *,0,1
prelim R1-seed,2,2
"""
        expected_seed_counts = {'R1-seed': 2}

        seed_counts = convert_prelim(prelim_csv,
                                     self.sam_file,
                                     self.remap_counts_writer,
                                     count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)

    def test_best_in_group(self):
        self.maxDiff = None
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,54M,=,1,0,\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2,89,R2-seed,1,0,54M,=,1,0,\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example3,89,R1-seed,1,0,54M,=,1,0,\
AAAAAATTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example4,89,R2-seed,1,0,54M,=,1,0,\
AAAAAAAATAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example5,89,R2-seed,1,0,54M,=,1,0,\
AAAAAAAAAAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD	VN:1.0	SO:unsorted
@SQ	SN:R1-seed	LN:9
@SQ	SN:R2-seed	LN:12
@PG	ID:bowtie2	PN:bowtie2	VN:2.2.3	CL:""
example1\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2\t89\tR2-seed\t1\t0\t54M\t=\t1\t0\t\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example3\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAAAAATTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example4\t89\tR2-seed\t1\t0\t54M\t=\t1\t0\t\
AAAAAAAATAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example5\t89\tR2-seed\t1\t0\t54M\t=\t1\t0\t\
AAAAAAAAAAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim R1-seed,2,2
prelim R2-seed,3,3
"""
        expected_seed_counts = {'R2-seed': 3}

        seed_counts = convert_prelim(prelim_csv,
                                     self.sam_file,
                                     self.remap_counts_writer,
                                     count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)

    def test_unmapped_read(self):
        self.maxDiff = None
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,54M,=,1,0,\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2,93,R1-seed,1,0,54M,=,1,0,\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD	VN:1.0	SO:unsorted
@SQ	SN:R1-seed	LN:9
@SQ	SN:R2-seed	LN:12
@PG	ID:bowtie2	PN:bowtie2	VN:2.2.3	CL:""
example1\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2\t93\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim R1-seed,1,2
"""
        expected_seed_counts = {}

        seed_counts = convert_prelim(prelim_csv,
                                     self.sam_file,
                                     self.remap_counts_writer,
                                     count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)
Exemple #41
0
def build_conseqs(conseqs_file,
                  run,
                  sample_sheet,
                  ok_sample_regions):
    """
    Turn a Pipeline-produced conseq file into JSON-ready hashes for QAI.

    @param conseqs_file: An open CSV file that contains the consensus
        sequences for all samples in the run. The columns read here are
        sample, region, q-cutoff, consensus-percent-cutoff and sequence.
    @param run: a hash with the attributes of the run record. Each entry of
        its 'sequencing_summary' supplies a 'tag' and a 'target_project'.
    @param sample_sheet: The data parsed from the sample sheet. Its "Data"
        entry is looked up with each row's sample value and has to provide
        "orig_sample_name" and "tags".
    @param ok_sample_regions: A set of (sample_name, region, qcut) tuples that
        were given a good score by the pipeline. The qcut is compared,
        unconverted, with the q-cutoff text read from the file.
    @return an array of JSON hashes, one for each conseq.
    """
    sequencing_summary = run['sequencing_summary']
    conseq_rows = csv.DictReader(conseqs_file)

    # Every (tag, seed name) pair that some sequencing in this run targets.
    project_config = ProjectConfig.loadDefault()
    target_regions = {
        (sequencing['tag'], seed_name)
        for sequencing in sequencing_summary
        for seed_name in project_config.getProjectSeeds(
            sequencing['target_project'])}

    conseqs = []
    for row in conseq_rows:
        # The sample column is used directly as the key into the sample
        # sheet's "Data", which holds the original Sample_Name and its tags.
        sample_key = row["sample"]
        sheet_entry = sample_sheet["Data"][sample_key]
        original_name = sheet_entry["orig_sample_name"]
        tags = sheet_entry["tags"]

        # FIXME if row["sequence"] is blank we replace it with a dash.
        # Need Conan to make that row blank-able.
        sequence = row["sequence"]
        if len(sequence) == 0:
            sequence = "-"

        region = row["region"]
        scored_ok = (sample_key, region, row["q-cutoff"]) in ok_sample_regions
        targeted = (tags, region) in target_regions
        conseqs.append({
            "samplename": original_name,
            # July 9, 2014: we can't do this properly right now
            # without a lookup table that is yet to be fully
            # defined.
            "testcode": None,
            "conseq_cutoff": row["consensus-percent-cutoff"],
            "region": region,
            "qcutoff": float(row["q-cutoff"]),
            # Whatever follows the last underscore of the sample value.
            "snum": sample_key.split('_')[-1],
            "seq": sequence,
            # Released only when the region scored well and was targeted.
            "ok_for_release": scored_ok and targeted})
    return conseqs
Exemple #42
0
class CoveragePlotsTest(TestCase):
    """ Check the plot file names and score rows written by coverage_plot. """

    def setUp(self):
        """ Load a scoring configuration with two projects into self.config.

        Project "R1" scores region R1 without key positions, and project
        "R1-and-R2" scores R1 with key positions 1 and 3, plus R2.
        """
        # Compare strings with assertMultiLineEqual to get line-by-line diffs.
        self.addTypeEqualityFunc(str, self.assertMultiLineEqual)
        self.config = ProjectConfig()
        self.config.load(StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        }
      ]
    },
    "R1-and-R2": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [
            {
              "end_pos": null,
              "start_pos": 1
            },
            {
              "end_pos": null,
              "start_pos": 3
            }
          ],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        },
        {
          "coordinate_region": "R2",
          "coordinate_region_length": 1,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R2-seed"
          ]
        }
      ]
    }
  }
}
"""))

    # Decorators are applied bottom-up, so the loadScoring mock is the first
    # mock argument and the savefig mock is the second.
    @patch('matplotlib.pyplot.savefig')
    @patch('micall.core.project_config.ProjectConfig.loadScoring')
    def test_simple(self, config_mock, savefig_mock):
        """ Region R1 with a count of 5 at each of its three positions.

        One plot is expected for each project that includes R1, named after
        the amino file's prefix, and one score row for each of those
        projects.
        """
        config_mock.return_value = self.config
        amino_rows = StringIO("""\
seed,region,q-cutoff,query.aa.pos,refseq.aa.pos,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*
R1-seed,R1,15,100,1,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
R1-seed,R1,15,101,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0
R1-seed,R1,15,102,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0
""")
        # The expected plot names below all start with this name's prefix.
        amino_rows.name = 'E1234.amino.csv'
        scores_out = StringIO()

        coverage_plot(amino_rows, coverage_scores_csv=scores_out)

        plots_expected = [call('E1234.R1.R1.png'),
                          call('E1234.R1-and-R2.R1.png')]
        scores_expected = """\
project,region,seed,q.cut,min.coverage,which.key.pos,off.score,on.score
R1,R1,R1-seed,15,5,1,-1,1
R1-and-R2,R1,R1-seed,15,5,1,-1,1
"""
        self.assertEqual(plots_expected, savefig_mock.mock_calls)
        self.assertEqual(scores_expected, scores_out.getvalue())
Exemple #43
0
def main():
    """ Process every sample named in the parsed command-line arguments.

    Each sample is handed to process_file() together with the default
    project configuration and the full set of arguments.
    """
    options = parse_args()
    default_projects = ProjectConfig.loadDefault()
    for name in options.sample:
        process_file(name, default_projects, options)
    print('Done.')
Exemple #44
0
def main():
    """ Register a new pipeline version and its project definitions in QAI.

    Reads the default project configuration and ``../project_scoring.json``
    (relative to the current working directory), logs in to QAI with the
    command-line credentials, and then posts whatever is missing: seed
    groups, regions and projects. A project version, its project regions and
    their key positions are always posted for the new pipeline.

    Raises RuntimeError if QAI already lists a pipeline with the requested
    version.
    """
    args = parse_args()
    project_config = ProjectConfig.loadDefault()
    # Plain text mode: the old 'U' flag was removed in Python 3.11, where
    # open(..., 'rU') raises ValueError.
    with open('../project_scoring.json', 'r') as scoring_file:
        scoring_config = json.load(scoring_file)
    with qai_helper.Session() as session:
        session.login(args.qai_server,
                      args.qai_user,
                      args.qai_password)

        # Refuse to upload the same pipeline version twice.
        pipelines = session.get_json(
            "/lab_miseq_pipelines?version=" + args.pipeline_version,
            retries=0)
        if pipelines:
            raise RuntimeError('Pipeline {} already exists.'.format(
                args.pipeline_version))

        # Create any region QAI doesn't know yet, along with its seed group.
        seed_groups = session.get_json("/lab_miseq_seed_groups")
        seed_group_ids = dict(map(itemgetter('name', 'id'), seed_groups))
        old_regions = session.get_json("/lab_miseq_regions", retries=0)
        regions = {region['name']: region for region in old_regions}
        for region_name, region_data in project_config.config['regions'].items():
            region = regions.get(region_name)
            if region is None:
                seed_group_name = region_data['seed_group']
                seed_group_id = seed_group_ids.get(seed_group_name)
                # Regions with an empty seed group name keep a None id.
                if seed_group_id is None and seed_group_name:
                    seed_group = session.post_json("/lab_miseq_seed_groups",
                                                   {'name': seed_group_name})
                    seed_group_id = seed_group['id']
                    seed_group_ids[seed_group_name] = seed_group_id
                region = session.post_json(
                    "/lab_miseq_regions",
                    {'name': region_name,
                     'is_nucleotide': region_data['is_nucleotide'],
                     # The reference is stored as a list of strings.
                     'reference': ''.join(region_data['reference']),
                     'seed_group_id': seed_group_id})
                regions[region_name] = region

        pipeline = session.post_json("/lab_miseq_pipelines",
                                     {'version': args.pipeline_version})
        pipeline_id = pipeline['id']

        # Projects are reused when they exist, but each one gets a new
        # project version tied to the new pipeline.
        old_projects = session.get_json("/lab_miseq_projects", retries=0)
        projects = {project['name']: project for project in old_projects}
        for project_name, project_data in project_config.config['projects'].items():
            project = projects.get(project_name)
            if project is None:
                project = session.post_json(
                    "/lab_miseq_projects",
                    {'name': project_name,
                     'max_variants': project_data['max_variants']})
            project_version = session.post_json("/lab_miseq_project_versions",
                                                {'pipeline_id': pipeline_id,
                                                 'project_id': project['id']})
            for i, region_data in enumerate(project_data['regions']):
                # Scoring regions are matched by position, so both files are
                # assumed to list a project's regions in the same order.
                scoring_data = scoring_config['projects'][project_name]['regions'][i]
                coordinate_region = regions[region_data['coordinate_region']]
                # The first seed region determines the seed group.
                seed_region = regions[region_data['seed_region_names'][0]]
                seed_group_id = seed_region['seed_group_id']
                project_region = session.post_json(
                    "/lab_miseq_project_regions",
                    {'project_version_id': project_version['id'],
                     'coordinate_region_id': coordinate_region['id'],
                     'min_coverage1': scoring_data['min_coverage1'],
                     'min_coverage2': scoring_data['min_coverage2'],
                     'min_coverage3': scoring_data['min_coverage3'],
                     'seed_group_id': seed_group_id})

                for key_position in scoring_data['key_positions']:
                    session.post_json("/lab_miseq_key_positions",
                                      {'project_region_id': project_region['id'],
                                       'start_pos': key_position['start_pos'],
                                       'end_pos': key_position['end_pos']})

    print("Done.")
Exemple #45
0
    def setUp(self):
        self.addTypeEqualityFunc(str, self.assertMultiLineEqual)
        config_json = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        }
      ]
    },
    "R1-and-R2": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [
            {
              "end_pos": null,
              "start_pos": 1
            },
            {
              "end_pos": null,
              "start_pos": 3
            }
          ],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R1-seed"
          ]
        },
        {
          "coordinate_region": "R2",
          "coordinate_region_length": 1,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": [
            "R2-seed"
          ]
        }
      ]
    }
  }
}
""")
        self.config = ProjectConfig()
        self.config.load(config_json)
Exemple #46
0
class ConvertPrelimTest(unittest.TestCase):
    def setUp(self):
        """ Create the seed references and in-memory outputs for each test.

        Loads two seed regions, R1-seed (9 bases) and R2-seed (12 bases), both
        in seed group "main", and prepares StringIO targets for the SAM file
        and the remap counts. The remap counts header is written here, so
        every expected remap counts string starts with it.
        """
        self.projects = ProjectConfig()
        self.projects.load(
            StringIO("""\
            {
              "regions": {
                "R1-seed": {
                  "seed_group": "main",
                  "reference": ["ACTAAAGGG"]
                },
                "R2-seed": {
                  "seed_group": "main",
                  "reference": ["ACTAAAGGGAAA"]
                }
              }
            }
            """))
        self.sam_file = StringIO()
        self.remap_counts = StringIO()
        # StringIO keeps the line terminator exactly as it is written, and the
        # expected strings in these tests end their lines with a plain line
        # feed. Using os.linesep here would make every remap counts
        # comparison fail on platforms where it is not a line feed.
        self.remap_counts_writer = DictWriter(
            self.remap_counts, ['type', 'filtered_count', 'count'],
            lineterminator='\n')
        self.remap_counts_writer.writeheader()

    def test_simple(self):
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,9M,=,1,0,AAACCCTTT,BBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD	VN:1.0	SO:unsorted
@SQ	SN:R1-seed	LN:9
@SQ	SN:R2-seed	LN:12
@PG	ID:bowtie2	PN:bowtie2	VN:2.2.3	CL:""
example1\t89\tR1-seed\t1\t0\t9M\t=\t1\t0\tAAACCCTTT\tBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim R1-seed,0,1
"""
        expected_seed_counts = {}

        seed_counts = convert_prelim(prelim_csv, self.sam_file,
                                     self.remap_counts_writer, count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)

    def test_two_regions(self):
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,9M,=,1,0,AAACCCTTT,BBBBBBBBB
example2,89,R2-seed,1,0,9M,=,1,0,AAAACCTTT,BBBBBBBBB
example3,89,R2-seed,1,0,9M,=,1,0,AAAAACTTT,BBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD	VN:1.0	SO:unsorted
@SQ	SN:R1-seed	LN:9
@SQ	SN:R2-seed	LN:12
@PG	ID:bowtie2	PN:bowtie2	VN:2.2.3	CL:""
example1\t89\tR1-seed\t1\t0\t9M\t=\t1\t0\tAAACCCTTT\tBBBBBBBBB
example2\t89\tR2-seed\t1\t0\t9M\t=\t1\t0\tAAAACCTTT\tBBBBBBBBB
example3\t89\tR2-seed\t1\t0\t9M\t=\t1\t0\tAAAAACTTT\tBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim R1-seed,0,1
prelim R2-seed,0,2
"""
        expected_seed_counts = {}

        seed_counts = convert_prelim(prelim_csv, self.sam_file,
                                     self.remap_counts_writer, count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)

    def test_long_reads(self):
        self.maxDiff = None
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,54M,=,1,0,\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2,89,R1-seed,1,0,54M,=,1,0,\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD	VN:1.0	SO:unsorted
@SQ	SN:R1-seed	LN:9
@SQ	SN:R2-seed	LN:12
@PG	ID:bowtie2	PN:bowtie2	VN:2.2.3	CL:""
example1\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim R1-seed,2,2
"""
        expected_seed_counts = {'R1-seed': 2}

        seed_counts = convert_prelim(prelim_csv, self.sam_file,
                                     self.remap_counts_writer, count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)

    def test_star_region(self):
        self.maxDiff = None
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,54M,=,1,0,\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2,89,R1-seed,1,0,54M,=,1,0,\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example3,93,*,*,*,*,*,*,*,\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD	VN:1.0	SO:unsorted
@SQ	SN:R1-seed	LN:9
@SQ	SN:R2-seed	LN:12
@PG	ID:bowtie2	PN:bowtie2	VN:2.2.3	CL:""
example1\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example3\t93\t*\t*\t*\t*\t*\t*\t*\t\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim *,0,1
prelim R1-seed,2,2
"""
        expected_seed_counts = {'R1-seed': 2}

        seed_counts = convert_prelim(prelim_csv, self.sam_file,
                                     self.remap_counts_writer, count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)

    def test_best_in_group(self):
        self.maxDiff = None
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,54M,=,1,0,\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2,89,R2-seed,1,0,54M,=,1,0,\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example3,89,R1-seed,1,0,54M,=,1,0,\
AAAAAATTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example4,89,R2-seed,1,0,54M,=,1,0,\
AAAAAAAATAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example5,89,R2-seed,1,0,54M,=,1,0,\
AAAAAAAAAAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD	VN:1.0	SO:unsorted
@SQ	SN:R1-seed	LN:9
@SQ	SN:R2-seed	LN:12
@PG	ID:bowtie2	PN:bowtie2	VN:2.2.3	CL:""
example1\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2\t89\tR2-seed\t1\t0\t54M\t=\t1\t0\t\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example3\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAAAAATTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example4\t89\tR2-seed\t1\t0\t54M\t=\t1\t0\t\
AAAAAAAATAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example5\t89\tR2-seed\t1\t0\t54M\t=\t1\t0\t\
AAAAAAAAAAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim R1-seed,2,2
prelim R2-seed,3,3
"""
        expected_seed_counts = {'R2-seed': 3}

        seed_counts = convert_prelim(prelim_csv, self.sam_file,
                                     self.remap_counts_writer, count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)

    def test_unmapped_read(self):
        self.maxDiff = None
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,54M,=,1,0,\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2,93,R1-seed,1,0,54M,=,1,0,\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD	VN:1.0	SO:unsorted
@SQ	SN:R1-seed	LN:9
@SQ	SN:R2-seed	LN:12
@PG	ID:bowtie2	PN:bowtie2	VN:2.2.3	CL:""
example1\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2\t93\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim R1-seed,1,2
"""
        expected_seed_counts = {}

        seed_counts = convert_prelim(prelim_csv, self.sam_file,
                                     self.remap_counts_writer, count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)