def setUp(self):
    self.config = ProjectConfig()
    self.defaultJsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1 and R2": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [1, 3],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": "R2",
          "coordinate_region_length": 1,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  }
}
""")
def map_references(contig_ref_name: str,
                   coordinates_name: str,
                   projects: ProjectConfig) -> typing.Mapping[int, int]:
    ref_seq = projects.getReference(contig_ref_name)
    coordinates_seq = projects.getReference(coordinates_name)
    aligned_coordinates, aligned_ref, _ = align_nucs(coordinates_seq, ref_seq)
    mapped_positions = {}
    coordinate_pos = ref_pos = 0
    for coordinate_nuc, ref_nuc in zip(aligned_coordinates, aligned_ref):
        if coordinate_nuc != '-':
            coordinate_pos += 1
        if ref_nuc != '-':
            ref_pos += 1
            mapped_positions[ref_pos] = coordinate_pos
    return mapped_positions
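# Usage sketch for map_references() (the reference names here are only
# examples drawn from the default project config, not fixed by this module).
# The result is a 1-based lookup from each position in the contig reference
# to the aligned position in the coordinate reference:
#
#     projects = ProjectConfig.loadDefault()
#     mapping = map_references('HIV1-B-FR-K03455-seed',
#                              'HIV1-CON-XX-Consensus-seed',
#                              projects)
#     coordinate_pos = mapping[100]  # coordinate position aligned to ref 100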
def find_coord_pos(projects: ProjectConfig,
                   coord_name: str,
                   start_pos: int = None,
                   end_pos: int = None):
    coord_seq = projects.getReference(coord_name)
    if start_pos is None:
        start_pos = 1
    if end_pos is None:
        end_pos = len(coord_seq) + 1
    if projects.config['regions'][coord_name]['is_nucleotide']:
        # Already have a nucleotide sequence, nothing to do.
        return coord_name, start_pos, end_pos

    gap_open = 40
    gap_extend = 10
    use_terminal_gap_penalty = 1
    highest_score = 0
    best_match = None
    ref_names = set()
    for project in projects.config['projects'].values():
        for region in project['regions']:
            if coord_name == region['coordinate_region']:
                ref_names.update(region['seed_region_names'])

    for ref_name in sorted(ref_names):
        ref_nuc_seq = projects.getReference(ref_name)
        for nuc_offset in range(3):
            ref_amino_seq = translate(ref_nuc_seq, nuc_offset)
            aligned_coord, aligned_ref, score = align_it_aa(
                coord_seq,
                ref_amino_seq,
                gap_open,
                gap_extend,
                use_terminal_gap_penalty)
            if score > highest_score:
                highest_score = score
                best_match = (ref_name, nuc_offset, aligned_coord, aligned_ref)

    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match
    coord_pos = ref_pos = 0
    ref_start = ref_end = None
    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
        if ref_amino != '-':
            ref_pos += 1
        if coord_amino != '-':
            coord_pos += 1
            if start_pos == coord_pos:
                ref_start = ref_pos * 3 - nuc_offset - 3
            if coord_pos == end_pos:
                ref_end = ref_pos * 3 - nuc_offset
    assert ref_start is not None
    assert ref_end is not None
    return ref_name, ref_start, ref_end
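# Worked example of the amino-to-nucleotide arithmetic above: with
# nuc_offset == 0, amino position 1 is covered by seed nucleotides 1-3, so
# ref_start = 1*3 - 0 - 3 == 0 and ref_end = 1*3 - 0 == 3, giving the
# zero-based, half-open slice ref_nuc_seq[0:3] that callers use below.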
def __init__(self, file=None, rules_yaml=None, genotype=None, references=None):
    """ Load ASI rules from a file or file object. """
    if references is None:
        projects = ProjectConfig.loadDefault()
        references = projects.getAllReferences()
        with WILD_TYPES_PATH.open() as wild_types_file:
            wild_types = safe_load(wild_types_file)
        references.update(wild_types)
    self.stds = {name if name != 'INT' else 'IN': ref
                 for name, ref in references.items()}

    # Algorithm info
    self.alg_version = ''
    self.alg_name = ''

    # definitions
    self.gene_def = {}  # {code: [drug_class_code]}
    self.level_def = {}  # {'1': 'Susceptible'}
    self.drug_class = defaultdict(list)  # {code: [drug_code]}
    # [['-INF', '9', '1'], ...] - first two are the range, the third one
    # is the res level.
    self.global_range = []
    self.comment_def = {}  # {code: comment_text}
    # {code: (name, [condition, [(action_type, action_value)]])}
    self.drugs = {}
    # Maybe skip for now? We don't really use this atm.
    self.mutation_comments = []

    if file is not None:
        self.load_xml(file)
    elif rules_yaml is not None:
        self.load_yaml(rules_yaml, genotype)
def build_config(project, sequences):
    projects = ProjectConfig()
    projects.config = {
        'projects': {
            project: {
                'regions': [{'seed_region_names': list(sequences.keys())}]
            }
        },
        'regions': {name: {'reference': [sequence]}
                    for name, sequence in sequences.items()}
    }
    return projects
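# Usage sketch for build_config() (hypothetical data; assumes
# ProjectConfig.getReference() joins the 'reference' chunks into one string):
#
#     projects = build_config('R1', {'R1-seed': 'ACTAAAGGG'})
#     assert projects.getReference('R1-seed') == 'ACTAAAGGG'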
def extract_v3loop_ref():
    ref_filename = os.path.join(os.path.dirname(__file__), 'v3loop_ref.txt')
    try:
        with open(ref_filename) as f:
            v3loop_ref = f.read()
    except FileNotFoundError:
        project_config = ProjectConfig.loadDefault()
        hiv_seed = project_config.getReference(G2P_SEED_NAME)
        coordinate_ref = project_config.getReference(COORDINATE_REF_NAME)
        v3loop_ref = extract_target(hiv_seed, coordinate_ref)
        with open(ref_filename, 'w') as f:
            f.write(v3loop_ref)
    return v3loop_ref
def load_references():
    projects = ProjectConfig.loadDefault()
    references = {}  # {(genotype, region): Reference}
    for ref_name, sequence in projects.getAllReferences().items():
        match = re.match(r'HCV(.*?)-.*-([^-]+)$', ref_name)
        if match:
            genotype = match.group(1)
            region = match.group(2)
            if region in HCV_REGIONS:
                reference = Reference(ref_name, sequence)
                references[(genotype, region)] = reference
                if genotype == '6':
                    references[('6E', region)] = reference
    return references
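# Example of the reference-name pattern above: 'HCV2-JFH-1-NS5b' matches as
# genotype '2' and region 'NS5b' (the lazy group stops at the first hyphen,
# and the final group takes everything after the last hyphen), so its
# Reference is stored under the key ('2', 'NS5b').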
def setUp(self):
    self.projects = ProjectConfig()
    self.projects.load(StringIO("""\
{
  "regions": {
    "R1-seed": {
      "seed_group": "main",
      "reference": ["ACTAAAGGG"]
    },
    "R2-seed": {
      "seed_group": "main",
      "reference": ["ACTAAAGGGAAA"]
    }
  }
}
"""))
    self.sam_file = StringIO()
    self.remap_counts = StringIO()
    self.remap_counts_writer = DictWriter(
        self.remap_counts,
        ['type', 'filtered_count', 'count'],
        lineterminator=os.linesep)
    self.remap_counts_writer.writeheader()
def main():
    fastq_files = [
        FastqFile('2130A-HCV_S15_L001_R1_001.fastq',
                  '2130',
                  False,
                  (FastqSection('HCV2-JFH-1-NS5b', 1, 60, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 117, 176, 100)),
                  (CodonMutation(159, 'GTC'),)),
        FastqFile('2130A-HCV_S15_L001_R2_001.fastq',
                  '2130',
                  True,
                  (FastqSection('HCV2-JFH-1-NS5b', 57, 116, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 171, 230, 100)),
                  (CodonMutation(159, 'GTC'),)),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R1_001.fastq',
                  '2130',
                  False,
                  (FastqSection('HCV2-JFH-1-NS5b', 231, 313, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 396, 478, 100)),
                  (CodonMutation(316, 'AGC'),)),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R2_001.fastq',
                  '2130',
                  True,
                  (FastqSection('HCV2-JFH-1-NS5b', 313, 395, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 479, 561, 100)),
                  (CodonMutation(316, 'AGC'),))]
    projects = ProjectConfig.loadDefault()
    for fastq_file in fastq_files:
        with open(fastq_file.name, 'w') as f:
            next_cluster = 1
            for section in fastq_file.sections:
                ref_name, ref_start, ref_end = find_coord_pos(
                    projects,
                    section.coord_name,
                    section.start_pos,
                    section.end_pos)
                ref_nuc_seq = projects.getReference(ref_name)
                ref_nuc_section = list(ref_nuc_seq[ref_start:ref_end])
                for mutation in fastq_file.mutations:
                    if section.start_pos <= mutation.pos <= section.end_pos:
                        section_pos = (mutation.pos - section.start_pos) * 3
                        ref_nuc_section[section_pos:section_pos + 3] = list(
                            mutation.codon)
                ref_nuc_section = ''.join(ref_nuc_section)
                if fastq_file.is_reversed:
                    ref_nuc_section = reverse_and_complement(ref_nuc_section)
                phred_scores = 'A' * (ref_end - ref_start)
                file_num = '2' if fastq_file.is_reversed else '1'
                for cluster in range(section.count):
                    f.write('@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1\n'.format(
                        fastq_file.extract_num,
                        cluster + next_cluster,
                        file_num))
                    f.write(ref_nuc_section + '\n')
                    f.write('+\n')
                    f.write(phred_scores + '\n')
                next_cluster += section.count
def setUp(self):
    self.defaultJsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 5,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"],
          "id": 10042
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": ["ACTGAAA", "GGG"],
      "seed_group": "R1-seeds"
    },
    "R1": {
      "is_nucleotide": false,
      "reference": ["RWN", "NWR"],
      "seed_group": null
    }
  }
}
""")
    self.config = ProjectConfig()
def read_contigs(contigs_csv, excluded_seeds=None):
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    contig_groups = defaultdict(list)  # {group_ref_name: [seq, index, index...]}
    conseqs = {}
    projects = ProjectConfig.loadDefault()
    with contigs_csv:
        contigs_reader = DictReader(contigs_csv)
        for i, row in reversed(list(enumerate(contigs_reader, 1))):
            contig_seq = row['contig']
            match_fraction = float(row['match'])
            is_match = 0.25 <= match_fraction
            is_reversed = match_fraction < 0
            if not (ARE_CONTIGS_MERGED and is_match):
                contig_name = get_contig_name(i,
                                              row['ref'],
                                              is_match,
                                              is_reversed,
                                              excluded_seeds)
                conseqs[contig_name] = contig_seq
                continue
            group_ref_name = row['group_ref']
            contig_group = contig_groups[group_ref_name]
            if not contig_group:
                contig_group.append(projects.getReference(group_ref_name))
            contig_group.append(str(i))
            group_seq = contig_group[0]
            agroup, acontig, score = align_it(group_seq,
                                              contig_seq,
                                              gap_open_penalty,
                                              gap_extend_penalty,
                                              use_terminal_gap_penalty)
            match = re.match('-*([^-](.*[^-])?)', acontig)
            start = match.start(1)
            end = match.end(1)
            merged_seq = agroup[:start] + contig_seq + agroup[end:]
            left_trim = len(agroup) - len(agroup.lstrip('-'))
            right_trim = len(agroup) - len(agroup.rstrip('-'))
            contig_group[0] = merged_seq[left_trim:-right_trim or None]
    is_match = True
    is_reversed = False
    for group_ref_name, contig_group in contig_groups.items():
        (group_seq, *contig_nums) = contig_group
        prefix = '_'.join(reversed(contig_nums))
        contig_name = get_contig_name(prefix,
                                      group_ref_name,
                                      is_match,
                                      is_reversed,
                                      excluded_seeds)
        conseqs[contig_name] = group_seq
    return conseqs
def fastq_g2p(pssm,
              fastq1,
              fastq2,
              g2p_csv,
              g2p_summary_csv=None,
              unmapped1=None,
              unmapped2=None,
              aligned_csv=None,
              min_count=1,
              min_valid=1,
              min_valid_percent=0.0,
              merged_contigs_csv=None):
    g2p_filename = getattr(g2p_csv, 'name', None)
    if g2p_filename is None:
        count_prefix = None
    else:
        working_path = os.path.dirname(g2p_csv.name)
        count_prefix = os.path.join(working_path, 'read_counts')

    project_config = ProjectConfig.loadDefault()
    hiv_seed = project_config.getReference(G2P_SEED_NAME)
    coordinate_ref = project_config.getReference(COORDINATE_REF_NAME)
    v3loop_ref = extract_target(hiv_seed, coordinate_ref)

    reader = FastqReader(fastq1, fastq2)
    merged_reads = merge_reads(reader)
    consensus_builder = ConsensusBuilder()
    counted_reads = consensus_builder.build(merged_reads)
    trimmed_reads = trim_reads(counted_reads, v3loop_ref)
    mapped_reads = write_unmapped_reads(trimmed_reads, unmapped1, unmapped2)
    read_counts = count_reads(mapped_reads, count_prefix)
    if aligned_csv is not None:
        read_counts = write_aligned_reads(read_counts,
                                          aligned_csv,
                                          hiv_seed,
                                          v3loop_ref)
    write_rows(pssm,
               read_counts,
               g2p_csv,
               g2p_summary_csv,
               min_count,
               min_valid=min_valid,
               min_valid_percent=min_valid_percent)

    if merged_contigs_csv is not None:
        contig_writer = DictWriter(merged_contigs_csv, ['contig'])
        contig_writer.writeheader()
        for consensus in consensus_builder.get_consensus_by_lengths():
            unambiguous_consensus = consensus.replace('N', '').replace('-', '')
            if unambiguous_consensus:
                contig_writer.writerow(dict(contig=consensus))
def test_duplicated_sars_base_amino(sequence_report):
    """ Special case for duplicated base in SARS orf1ab.

    Expect amino sequence AQSFLNRVCG.
    """
    # refname,qcut,rank,count,offset,seq
    aligned_reads = prepare_reads("""\
SARS-CoV-2-seed,15,0,9,0,GCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACAC
""")
    # Repeat is here:                      ^

    # A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*,X,...,coverage
    expected_text = """\
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,1,4396,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,4,4397,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,7,4398,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,10,4399,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,13,4400,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,16,4401,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,18,4402,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,21,4403,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,24,4404,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,27,4405,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9"""

    sequence_report.projects = ProjectConfig.loadDefault()
    orf1ab_size = len(
        sequence_report.projects.getReference('SARS-CoV-2-ORF1ab'))
    nsp12_size = len(sequence_report.projects.getReference('SARS-CoV-2-nsp12'))

    report_file = StringIO()
    sequence_report.write_amino_header(report_file)
    sequence_report.read(aligned_reads)
    sequence_report.write_amino_counts()

    report = report_file.getvalue()
    report_lines = report.splitlines()
    expected_size = orf1ab_size + nsp12_size + 1
    if len(report_lines) != expected_size:
        assert (len(report_lines), report) == (expected_size, '')

    key_lines = report_lines[4396:4406]
    key_report = '\n'.join(key_lines)
    assert key_report == expected_text
def main():
    project_config = ProjectConfig.loadDefault()
    error_count = 0
    unchecked_ref_names = set(project_config.getAllReferences().keys())
    error_count += check_hcv_seeds(project_config, unchecked_ref_names)
    error_count += check_hcv_coordinates(project_config, unchecked_ref_names)
    error_count += check_hiv_seeds(project_config, unchecked_ref_names)
    error_count += check_hiv_coordinates(project_config, unchecked_ref_names)
    error_count += check_hiv_wild_types(project_config)
    error_count += check_hla_seeds(project_config, unchecked_ref_names)
    error_count += check_hla_coordinates(project_config, unchecked_ref_names)
    if not unchecked_ref_names:
        print('No unchecked refs.')
    else:
        print(fill_report(f'Unchecked refs: '
                          f'{", ".join(sorted(unchecked_ref_names))}'))
        error_count += len(unchecked_ref_names)
    print(f'Total errors: {error_count}.')
def test_duplicated_sars_base_nuc(sequence_report):
    """ Make sure duplicated base in SARS isn't duplicated in nuc.csv. """
    # refname,qcut,rank,count,offset,seq
    aligned_reads = prepare_reads("""\
SARS-CoV-2-seed,15,0,9,10,ACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCG
""")

    # A,C,G,T,N,...,coverage
    expected_section = """\
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,21,13198,0,0,0,9,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,22,13199,0,0,0,9,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,23,13200,9,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,24,13201,9,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,25,13202,9,0,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,26,13203,0,9,0,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,27,13204,0,0,9,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,28,13205,0,0,9,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,29,13206,0,0,9,0,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,30,13207,0,0,0,9,0,0,0,0,0,9
SARS-CoV-2-seed,SARS-CoV-2-ORF1ab,15,31,13208,0,0,0,9,0,0,0,0,0,9"""

    sequence_report.projects = ProjectConfig.loadDefault()
    orf1ab_size = len(
        sequence_report.projects.getReference('SARS-CoV-2-ORF1ab'))
    nsp12_size = len(sequence_report.projects.getReference('SARS-CoV-2-nsp12'))

    report_file = StringIO()
    sequence_report.write_nuc_header(report_file)
    sequence_report.read(aligned_reads)
    sequence_report.write_nuc_counts()

    report = report_file.getvalue()
    report_lines = report.splitlines()
    header_size = 1
    skipped_rows = 2
    expected_size = (orf1ab_size + nsp12_size) * 3 + header_size - skipped_rows
    if len(report_lines) != expected_size:
        assert (len(report_lines), report) == (expected_size, '')

    key_lines = report_lines[13198:13209]
    key_report = '\n'.join(key_lines)
    assert key_report == expected_section
def main():
    # find_best_match_for_pssm()
    sequences = fetch_alignment_sequences(2004,
                                          'CON',  # Consensus/Ancestral
                                          'ENV')
    consensus = sequences['CON_OF_CONS'].replace('-', '').upper()

    project_config = ProjectConfig.loadDefault()
    ref_names = set(project_config.getAllReferences().keys())

    new_sequences = fetch_alignment_sequences('2015', 'COM')
    consensus_accession = 'Consensus'
    assert consensus_accession not in new_sequences, sorted(new_sequences.keys())
    new_sequences[consensus_accession] = consensus

    for line in compare_config('HIV', project_config, new_sequences, ref_names):
        print(line, end='')
    print('Unchecked refs: ' + ', '.join(sorted(ref_names)))
def __init__(self,
             file=None,
             rules_yaml=None,
             genotype=None,
             references=None,
             backup_genotype=None):
    """ Load ASI rules from a file or file object. """
    if references is None:
        projects = ProjectConfig.loadDefault()
        references = projects.getAllReferences()
        with WILD_TYPES_PATH.open() as wild_types_file:
            wild_types = safe_load(wild_types_file)
        references.update(wild_types)
    self.stds = {name if name != 'INT' else 'IN': ref
                 for name, ref in references.items()}

    # Algorithm info
    self.alg_version = ''
    self.alg_name = ''

    # definitions
    self.gene_def = {}  # {code: [drug_class_code]}
    self.level_def = {}  # {'1': 'Susceptible'}
    self.drug_class = defaultdict(list)  # {code: [drug_code]}
    # [['-INF', '9', '1'], ...] - first two are the range, the third one
    # is the res level.
    self.global_range = []
    self.comment_def = {}  # {code: comment_text}
    # {code: (name, [condition, [(action_type, action_value)]])}
    self.drugs = {}
    # Maybe skip for now? We don't really use this atm.
    self.mutation_comments = []

    if file is not None:
        self.load_xml(file)
    elif rules_yaml is not None:
        self.load_yaml(rules_yaml, genotype, backup_genotype)
def fastq_g2p(pssm,
              fastq1,
              fastq2,
              g2p_csv,
              g2p_summary_csv=None,
              unmapped1=None,
              unmapped2=None,
              aligned_csv=None,
              min_count=1,
              min_valid=1,
              min_valid_percent=0.0):
    g2p_filename = getattr(g2p_csv, 'name', None)
    if g2p_filename is None:
        count_prefix = None
    else:
        working_path = os.path.dirname(g2p_csv.name)
        count_prefix = os.path.join(working_path, 'read_counts')

    project_config = ProjectConfig.loadDefault()
    hiv_seed = project_config.getReference(G2P_SEED_NAME)
    coordinate_ref = project_config.getReference(COORDINATE_REF_NAME)
    v3loop_ref = extract_target(hiv_seed, coordinate_ref)

    reader = FastqReader(fastq1, fastq2)
    merged_reads = merge_reads(reader)
    trimmed_reads = trim_reads(merged_reads, v3loop_ref)
    mapped_reads = write_unmapped_reads(trimmed_reads, unmapped1, unmapped2)
    read_counts = count_reads(mapped_reads, count_prefix)
    if aligned_csv is not None:
        read_counts = write_aligned_reads(read_counts,
                                          aligned_csv,
                                          hiv_seed,
                                          v3loop_ref)
    write_rows(pssm,
               read_counts,
               g2p_csv,
               g2p_summary_csv,
               min_count,
               min_valid=min_valid,
               min_valid_percent=min_valid_percent)
def write_nuc_mutations(nuc_csv: typing.TextIO,
                        nuc_mutations_csv: typing.TextIO):
    nuc_rows = DictReader(nuc_csv)
    mutations_writer = DictWriter(nuc_mutations_csv,
                                  ['seed',
                                   'region',
                                   'wt',
                                   'refseq_nuc_pos',
                                   'var',
                                   'prevalence'],
                                  lineterminator=os.linesep)
    mutations_writer.writeheader()
    for seed, seed_rows in groupby(nuc_rows, itemgetter('seed')):
        if seed != 'SARS-CoV-2-seed':
            continue
        landmark_reader = LandmarkReader.load()
        projects = ProjectConfig.loadDefault()
        for region_name, region_rows in groupby(seed_rows,
                                                itemgetter('region')):
            region = landmark_reader.get_gene(seed, region_name)
            seed_seq = projects.getReference(seed)
            ref_seq = seed_seq[region['start']-1:region['end']]
            for row in region_rows:
                nuc_pos = int(row['refseq.nuc.pos'])
                wild_type = ref_seq[nuc_pos-1]
                coverage = int(row['coverage'])
                if coverage == 0:
                    continue
                for nuc in 'ACGT':
                    if nuc == wild_type:
                        continue
                    nuc_count = int(row[nuc])
                    prevalence = nuc_count / coverage
                    if prevalence >= 0.05:
                        mutations_writer.writerow(dict(seed=seed,
                                                       region=region_name,
                                                       wt=wild_type,
                                                       refseq_nuc_pos=nuc_pos,
                                                       var=nuc,
                                                       prevalence=prevalence))
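# Worked example of the prevalence cutoff above (illustrative numbers): at a
# position with coverage 200, wild type 'C', and 15 reads calling 'T', the
# prevalence is 15 / 200 == 0.075 >= 0.05, so a row with wt='C', var='T',
# prevalence=0.075 is written; 9 reads (0.045) would be filtered out.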
def main():
    project_config = ProjectConfig.loadDefault()
    error_count = 0
    unchecked_ref_names = set(project_config.getAllReferences().keys())
    error_count += check_hcv_seeds(project_config, unchecked_ref_names)
    error_count += check_hcv_coordinates(project_config, unchecked_ref_names)
    error_count += check_hiv_seeds(project_config, unchecked_ref_names)
    error_count += check_hiv_coordinates(project_config, unchecked_ref_names)
    error_count += check_hiv_wild_types(project_config)
    error_count += check_hla_seeds(project_config, unchecked_ref_names)
    error_count += check_hla_coordinates(project_config, unchecked_ref_names)
    error_count += check_sars_seeds(project_config, unchecked_ref_names)
    error_count += check_sars_coordinates(project_config, unchecked_ref_names)
    if not unchecked_ref_names:
        print('No unchecked refs.')
    else:
        print(fill_report(f'Unchecked refs: '
                          f'{", ".join(sorted(unchecked_ref_names))}'))
        error_count += len(unchecked_ref_names)
    print(f'Total errors: {error_count}.')
def load_hcv(seqs):
    hcv_definitions = DictReader(StringIO("""\
protocol,name,direction,length,h77_pos,sequence
HCV WG,oligo dA20,R,20,9418-9437,AAAAAAAAAAAAAAAAAAAA
,Pr3,R,30,8616-8645,GGCGGAATTCCTGGTCATAGCCTCCGTGAA
,1abGENF1bp,F,28,266-293,GGGTCGCGAAAGGCCTTGTGGTACTGCC
,TIM-Pr3,R,30,8616-8645,CAGGAAACAGCTATGACGGCGGAATTCCTGGTCATAGCCTCCGTGAA
,1abGENF2,F,30,286-315,GTACTGCCTGATAGGGTGCTTGCGAGTGCC
,Pr6,R,30,8611-8640,AATTCCTGGTCATAGCCTCCGTGAAGACTC
HCV miDi,Pr1,F,31,8245-8275,TGGGGTTCGCGTATGATACCCGCTGCTTTGA
,Pr2,F,31,8245-8275,TGGGGTTTTCTTACGACACCAGGTGCTTTGA
,oligo dA20-TIM,R,20,9418-9437,CAGGAAACAGCTATGACAAAAAAAAAAAAAAAAAAAA
,Pr4,F,29,8253-8281,CCGTATGATACCCGCTGCTTTGACTCAAC
,Pr5,F,29,8253-8281,TCCTACGACACCAGGTGCTTTGATTCAAC
,TIM,R,,1-0,CAGGAAACAGCTATGAC
"""))
    projects = ProjectConfig.loadDefault()
    h77 = projects.getReference('HCV-1a')
    is_comparing = True
    differ = Differ()
    for row in hcv_definitions:
        name = 'HCV ' + row['name']
        start, end = (int(pos) for pos in row['h77_pos'].split('-'))
        primer = SeqRecord(Seq(row['sequence']), name, description='')
        complement = primer.reverse_complement(id=primer.id, description='')
        direction = row['direction']
        if direction == 'F':
            seqs['left'].append(primer)
        else:
            seqs['right'].append(primer)
            primer, complement = complement, primer
        h77_section = Seq(h77[start - 1:end])
        if is_comparing and primer.seq != h77_section:
            print(name, 'does not match.')
            diffs = differ.compare([str(primer.seq) + '\n'],
                                   [str(h77_section) + '\n'])
            print(*diffs, sep='')
def setUp(self):
    self.projects = ProjectConfig()
    self.projects.load(StringIO("""\
{
  "regions": {
    "R1-seed": {
      "seed_group": "main",
      "reference": ["ACTAAAGGG"]
    },
    "R2-seed": {
      "seed_group": "main",
      "reference": ["ACTAAAGGGAAA"]
    }
  }
}
"""))
    self.sam_file = StringIO()
    self.remap_counts = StringIO()
    self.remap_counts_writer = DictWriter(
        self.remap_counts,
        ['type', 'filtered_count', 'count'],
        lineterminator=os.linesep)
    self.remap_counts_writer.writeheader()
def setUp(self):
    self.defaultJsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 5,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"],
          "id": 10042
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": ["ACTGAAA", "GGG"],
      "seed_group": "R1-seeds"
    },
    "R1": {
      "is_nucleotide": false,
      "reference": ["RWN", "NWR"],
      "seed_group": null
    }
  }
}
""")
    self.config = ProjectConfig()
class ProjectConfigurationTest(unittest.TestCase):
    def setUp(self):
        self.defaultJsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 5,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"],
          "id": 10042
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": ["ACTGAAA", "GGG"],
      "seed_group": "R1-seeds"
    },
    "R1": {
      "is_nucleotide": false,
      "reference": ["RWN", "NWR"],
      "seed_group": null
    }
  }
}
""")
        self.config = ProjectConfig()

    def testConvert(self):
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""
        fasta = StringIO()
        self.config.load(self.defaultJsonIO)
        self.config.writeSeedFasta(fasta)
        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testSharedRegions(self):
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {"coordinate_region": null, "seed_region_names": ["R1-seed"]}
      ]
    },
    "R1 and R2": {
      "regions": [
        {"coordinate_region": null, "seed_region_names": ["R1-seed"]},
        {"coordinate_region": null, "seed_region_names": ["R2-seed"]}
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": ["ACTGAAA", "GGG"]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": ["TTT"]
    }
  }
}
""")
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
>R2-seed
TTT
"""
        fasta = StringIO()
        self.config.load(jsonIO)
        self.config.writeSeedFasta(fasta)
        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testUnusedRegion(self):
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {"coordinate_region": null, "seed_region_names": ["R1-seed"]}
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": ["ACTGAAA", "GGG"]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": ["TTT"]
    }
  }
}
""")
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""
        fasta = StringIO()
        self.config.load(jsonIO)
        self.config.writeSeedFasta(fasta)
        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testExcludeSeeds(self):
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {"coordinate_region": null, "seed_region_names": ["R1-seed"]}
      ]
    },
    "R2": {
      "regions": [
        {"coordinate_region": null, "seed_region_names": ["R2-seed"]}
      ]
    },
    "R3": {
      "regions": [
        {"coordinate_region": null, "seed_region_names": ["R3-seed"]}
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": ["ACTGAAA", "GGG"]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": ["TTT"]
    },
    "R3-seed": {
      "is_nucleotide": true,
      "reference": ["TAG"]
    }
  }
}
""")
        expected_fasta = """\
>R2-seed
TTT
"""
        fasta = StringIO()
        self.config.load(jsonIO)
        self.config.writeSeedFasta(fasta, excluded_seeds=['R1-seed', 'R3-seed'])
        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testExcludeUnknownSeed(self):
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""
        fasta = StringIO()
        self.config.load(self.defaultJsonIO)
        self.config.writeSeedFasta(fasta, excluded_seeds=['R99-seed'])
        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testDuplicateReference(self):
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {"coordinate_region": null,
         "seed_region_names": ["R1a-seed", "R1b-seed"]}
      ]
    }
  },
  "regions": {
    "R1a-seed": {
      "is_nucleotide": true,
      "reference": ["ACTAAAGGG"]
    },
    "R1b-seed": {
      "is_nucleotide": true,
      "reference": ["ACTAAAGGG"]
    }
  }
}
""")
        fasta = StringIO()
        self.config.load(jsonIO)

        self.assertRaisesRegex(RuntimeError,
                               "Duplicate references: R1a-seed and R1b-seed.",
                               self.config.writeSeedFasta,
                               fasta)

    def testGetReference(self):
        self.config.load(self.defaultJsonIO)
        seed_name = 'R1-seed'
        expected_ref = 'ACTGAAAGGG'

        seed_ref = self.config.getReference(seed_name)

        self.assertSequenceEqual(expected_ref, seed_ref)

    def testGetCoordinateReferences(self):
        self.config.load(self.defaultJsonIO)
        seed_name = 'R1-seed'
        expected_refs = {'R1': 'RWNNWR'}

        coordinate_refs = self.config.getCoordinateReferences(seed_name)

        self.assertDictEqual(expected_refs, coordinate_refs)

    def testGetAllReferences(self):
        expected_references = {'R1-seed': 'ACTGAAAGGG', 'R1': 'RWNNWR'}
        self.config.load(self.defaultJsonIO)

        references = self.config.getAllReferences()

        self.assertEqual(expected_references, references)

    def testUnknownReference(self):
        self.config.load(self.defaultJsonIO)
        seed_name = 'R-unknown'

        self.assertRaises(KeyError, self.config.getReference, seed_name)

    def testMaxVariants(self):
        self.config.load(self.defaultJsonIO)
        coordinate_region_name = 'R1'

        self.assertEqual(5, self.config.getMaxVariants(coordinate_region_name))

    def testMaxVariantsUnusedRegion(self):
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 2,
      "regions": [
        {"coordinate_region": "R1", "seed_region_names": ["R1-seed"]}
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": ["ACTGAAA", "GGG"]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": ["NSFW"]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": ["RSW"]
    }
  }
}
""")
        self.config.load(jsonIO)
        coordinate_region_name = 'R2'

        self.assertEqual(0, self.config.getMaxVariants(coordinate_region_name))

    def testMaxVariantsTwoProjects(self):
        """ If two projects specify a maximum for the same coordinate region,
        use the bigger of the two.
        """
        jsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 9,
      "regions": [
        {"coordinate_region": "R1", "seed_region_names": ["R1-seed"]}
      ]
    },
    "R1-and-R2": {
      "max_variants": 2,
      "regions": [
        {"coordinate_region": "R1", "seed_region_names": ["R1-seed"]},
        {"coordinate_region": "R2", "seed_region_names": ["R1-seed"]}
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": ["ACTGAAA", "GGG"]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": ["NSFW"]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": ["RSW"]
    }
  }
}
""")
        self.config.load(jsonIO)
        coordinate_region_name = 'R1'

        self.assertEqual(9, self.config.getMaxVariants(coordinate_region_name))

    def testReload(self):
        jsonIO1 = StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {"coordinate_region": null, "seed_region_names": ["R1-seed"]}
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": ["ACTGAAA", "GGG"]
    }
  }
}
""")
        jsonIO2 = StringIO("""\
{
  "projects": {
    "R2": {
      "regions": [
        {"coordinate_region": null, "seed_region_names": ["R2-seed"]}
      ]
    }
  },
  "regions": {
    "R2-seed": {
      "is_nucleotide": true,
      "reference": ["GACCTA"]
    }
  }
}
""")
        self.config.load(jsonIO1)
        self.config.load(jsonIO2)

        self.assertRaises(KeyError, self.config.getReference, "R1-seed")
        self.assertSequenceEqual("GACCTA", self.config.getReference("R2-seed"))

    def testProjectSeeds(self):
        expected_seeds = set(['R1-seed'])
        self.config.load(self.defaultJsonIO)

        seeds = self.config.getProjectSeeds('R1')

        self.assertSetEqual(expected_seeds, seeds)

    def testSeedGroup(self):
        expected_group = "R1-seeds"
        self.config.load(self.defaultJsonIO)

        group = self.config.getSeedGroup('R1-seed')

        self.assertEqual(expected_group, group)
class ProjectConfigurationProjectRegionsTest(unittest.TestCase):
    def setUp(self):
        self.config = ProjectConfig()
        self.defaultJsonIO = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1 and R2": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [1, 3],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": "R2",
          "coordinate_region_length": 1,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  }
}
""")

    def testProjectRegions(self):
        expected_project_regions = [{"project_name": "R1",
                                     "coordinate_region_length": 3,
                                     "key_positions": [],
                                     "min_coverage1": 10,
                                     "min_coverage2": 50,
                                     "min_coverage3": 100},
                                    {"project_name": "R1 and R2",
                                     "coordinate_region_length": 3,
                                     "key_positions": [1, 3],
                                     "min_coverage1": 10,
                                     "min_coverage2": 50,
                                     "min_coverage3": 100}]
        self.config.load(self.defaultJsonIO)

        project_regions = list(self.config.getProjectRegions('R1-seed', 'R1'))

        self.assertEqual(expected_project_regions, project_regions)

    def testProjectExcluded(self):
        excluded_projects = ['R1']
        expected_project_regions = [{"project_name": "R1 and R2",
                                     "coordinate_region_length": 3,
                                     "key_positions": [1, 3],
                                     "min_coverage1": 10,
                                     "min_coverage2": 50,
                                     "min_coverage3": 100}]
        self.config.load(self.defaultJsonIO)

        project_regions = list(
            self.config.getProjectRegions('R1-seed', 'R1', excluded_projects))

        self.assertEqual(expected_project_regions, project_regions)
def setUp(self):
    self.addTypeEqualityFunc(str, self.assertMultiLineEqual)
    config_json = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1-and-R2": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [
            {"end_pos": null, "start_pos": 1},
            {"end_pos": null, "start_pos": 3}
          ],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": "R2",
          "coordinate_region_length": 1,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  }
}
""")
    self.config = ProjectConfig()
    self.config.load(config_json)
def aln2counts(aligned_csv,
               nuc_csv,
               amino_csv,
               coord_ins_csv,
               conseq_csv,
               failed_align_csv,
               callback=None,
               coverage_summary_csv=None,
               clipping_csv=None,
               conseq_ins_csv=None,
               g2p_aligned_csv=None,
               remap_conseq_csv=None,
               conseq_region_csv=None):
    """ Analyze aligned reads for nucleotide and amino acid frequencies.

    Generate consensus sequences.

    @param aligned_csv: Open file handle containing aligned reads
        (from sam2aln)
    @param nuc_csv: Open file handle to write nucleotide frequencies.
    @param amino_csv: Open file handle to write amino acid frequencies.
    @param coord_ins_csv: Open file handle to write insertions relative to
        coordinate reference.
    @param conseq_csv: Open file handle to write consensus sequences.
    @param failed_align_csv: Open file handle to write sample consensus
        sequences that failed to align to the coordinate reference.
    @param callback: a function to report progress with three optional
        parameters - callback(message, progress, max_progress)
    @param coverage_summary_csv: Open file handle to write coverage depth.
    @param clipping_csv: Open file handle containing soft clipping counts
    @param conseq_ins_csv: Open file handle containing insertions relative
        to consensus sequence
    @param g2p_aligned_csv: Open file handle containing aligned reads
        (from fastq_g2p)
    @param remap_conseq_csv: Open file handle containing consensus sequences
        from the remap step.
    @param conseq_region_csv: Open file handle to write consensus sequences
        split into regions.
    """
    # load project information
    projects = ProjectConfig.loadDefault()

    # initialize reporter classes
    with InsertionWriter(coord_ins_csv) as insert_writer:
        report = SequenceReport(insert_writer,
                                projects,
                                CONSEQ_MIXTURE_CUTOFFS)
        report.consensus_min_coverage = CONSENSUS_MIN_COVERAGE
        report.write_amino_header(amino_csv)
        report.write_consensus_header(conseq_csv)
        report.write_consensus_regions_header(conseq_region_csv)
        report.write_failure_header(failed_align_csv)
        report.write_nuc_header(nuc_csv)
        if coverage_summary_csv is None:
            coverage_summary = coverage_writer = None
        else:
            coverage_writer = csv.DictWriter(coverage_summary_csv,
                                             ['avg_coverage',
                                              'coverage_region',
                                              'region_width'],
                                             lineterminator=os.linesep)
            coverage_writer.writeheader()
            coverage_summary = {}

        if callback:
            aligned_filename = getattr(aligned_csv, 'name', None)
            if aligned_filename:
                file_size = os.stat(aligned_filename).st_size
                report.enable_callback(callback, file_size)

        if clipping_csv is not None:
            report.read_clipping(clipping_csv)
        if conseq_ins_csv is not None:
            report.read_insertions(conseq_ins_csv)
        if remap_conseq_csv is not None:
            report.read_remap_conseqs(remap_conseq_csv)

        report.process_reads(g2p_aligned_csv, aligned_csv, coverage_summary)

        if coverage_summary_csv is not None:
            if coverage_summary:
                coverage_writer.writerow(coverage_summary)
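# Minimal usage sketch for aln2counts() (hypothetical file names; a real run
# wires these handles to the pipeline's working folder):
#
#     with open('aligned.csv') as aligned_csv, \
#             open('nuc.csv', 'w') as nuc_csv, \
#             open('amino.csv', 'w') as amino_csv, \
#             open('coord_ins.csv', 'w') as coord_ins_csv, \
#             open('conseq.csv', 'w') as conseq_csv, \
#             open('failed_align.csv', 'w') as failed_align_csv:
#         aln2counts(aligned_csv, nuc_csv, amino_csv, coord_ins_csv,
#                    conseq_csv, failed_align_csv)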
class ProjectConfigurationTest(unittest.TestCase):
    def setUp(self):
        self.defaultJsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 5,
      "regions": [
        {
          "coordinate_region": "R1",
          "seed_region_names": ["R1-seed"],
          "id": 10042
        }
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": ["ACTGAAA", "GGG"],
      "seed_group": "R1-seeds"
    },
    "R1": {
      "is_nucleotide": false,
      "reference": ["RWN", "NWR"],
      "seed_group": null
    }
  }
}
""")
        self.config = ProjectConfig()

    def testConvert(self):
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""
        fasta = StringIO.StringIO()
        self.config.load(self.defaultJsonIO)
        self.config.writeSeedFasta(fasta)
        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testSharedRegions(self):
        jsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {"coordinate_region": null, "seed_region_names": ["R1-seed"]}
      ]
    },
    "R1 and R2": {
      "regions": [
        {"coordinate_region": null, "seed_region_names": ["R1-seed"]},
        {"coordinate_region": null, "seed_region_names": ["R2-seed"]}
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": ["ACTGAAA", "GGG"]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": ["TTT"]
    }
  }
}
""")
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
>R2-seed
TTT
"""
        fasta = StringIO.StringIO()
        self.config.load(jsonIO)
        self.config.writeSeedFasta(fasta)
        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testUnusedRegion(self):
        jsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {"coordinate_region": null, "seed_region_names": ["R1-seed"]}
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": ["ACTGAAA", "GGG"]
    },
    "R2-seed": {
      "is_nucleotide": true,
      "reference": ["TTT"]
    }
  }
}
""")
        expected_fasta = """\
>R1-seed
ACTGAAAGGG
"""
        fasta = StringIO.StringIO()
        self.config.load(jsonIO)
        self.config.writeSeedFasta(fasta)
        self.assertMultiLineEqual(expected_fasta, fasta.getvalue())

    def testDuplicateReference(self):
        jsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {"coordinate_region": null,
         "seed_region_names": ["R1a-seed", "R1b-seed"]}
      ]
    }
  },
  "regions": {
    "R1a-seed": {
      "is_nucleotide": true,
      "reference": ["ACTAAAGGG"]
    },
    "R1b-seed": {
      "is_nucleotide": true,
      "reference": ["ACTAAAGGG"]
    }
  }
}
""")
        fasta = StringIO.StringIO()
        self.config.load(jsonIO)

        self.assertRaisesRegexp(RuntimeError,
                                "Duplicate references: R1a-seed and R1b-seed.",
                                self.config.writeSeedFasta,
                                fasta)

    def testGetReference(self):
        self.config.load(self.defaultJsonIO)
        seed_name = 'R1-seed'
        expected_ref = 'ACTGAAAGGG'

        seed_ref = self.config.getReference(seed_name)

        self.assertSequenceEqual(expected_ref, seed_ref)

    def testGetCoordinateReferences(self):
        self.config.load(self.defaultJsonIO)
        seed_name = 'R1-seed'
        expected_refs = {'R1': 'RWNNWR'}

        coordinate_refs = self.config.getCoordinateReferences(seed_name)

        self.assertDictEqual(expected_refs, coordinate_refs)

    def testUnknownReference(self):
        self.config.load(self.defaultJsonIO)
        seed_name = 'R-unknown'

        self.assertRaises(KeyError, self.config.getReference, seed_name)

    def testMaxVariants(self):
        self.config.load(self.defaultJsonIO)
        coordinate_region_name = 'R1'

        self.assertEqual(5, self.config.getMaxVariants(coordinate_region_name))

    def testMaxVariantsUnusedRegion(self):
        jsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 2,
      "regions": [
        {"coordinate_region": "R1", "seed_region_names": ["R1-seed"]}
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": ["ACTGAAA", "GGG"]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": ["NSFW"]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": ["RSW"]
    }
  }
}
""")
        self.config.load(jsonIO)
        coordinate_region_name = 'R2'

        self.assertEqual(0, self.config.getMaxVariants(coordinate_region_name))

    def testMaxVariantsTwoProjects(self):
        """ If two projects specify a maximum for the same coordinate region,
        use the bigger of the two.
        """
        jsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 9,
      "regions": [
        {"coordinate_region": "R1", "seed_region_names": ["R1-seed"]}
      ]
    },
    "R1-and-R2": {
      "max_variants": 2,
      "regions": [
        {"coordinate_region": "R1", "seed_region_names": ["R1-seed"]},
        {"coordinate_region": "R2", "seed_region_names": ["R1-seed"]}
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": ["ACTGAAA", "GGG"]
    },
    "R1": {
      "is_nucleotide": false,
      "reference": ["NSFW"]
    },
    "R2": {
      "is_nucleotide": false,
      "reference": ["RSW"]
    }
  }
}
""")
        self.config.load(jsonIO)
        coordinate_region_name = 'R1'

        self.assertEqual(9, self.config.getMaxVariants(coordinate_region_name))

    def testReload(self):
        jsonIO1 = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "regions": [
        {"coordinate_region": null, "seed_region_names": ["R1-seed"]}
      ]
    }
  },
  "regions": {
    "R1-seed": {
      "is_nucleotide": true,
      "reference": ["ACTGAAA", "GGG"]
    }
  }
}
""")
        jsonIO2 = StringIO.StringIO("""\
{
  "projects": {
    "R2": {
      "regions": [
        {"coordinate_region": null, "seed_region_names": ["R2-seed"]}
      ]
    }
  },
  "regions": {
    "R2-seed": {
      "is_nucleotide": true,
      "reference": ["GACCTA"]
    }
  }
}
""")
        self.config.load(jsonIO1)
        self.config.load(jsonIO2)

        self.assertRaises(KeyError, self.config.getReference, "R1-seed")
        self.assertSequenceEqual("GACCTA", self.config.getReference("R2-seed"))

    def testProjectSeeds(self):
        expected_seeds = set(['R1-seed'])
        self.config.load(self.defaultJsonIO)

        seeds = self.config.getProjectSeeds('R1')

        self.assertSetEqual(expected_seeds, seeds)

    def testSeedGroup(self):
        expected_group = "R1-seeds"
        self.config.load(self.defaultJsonIO)

        group = self.config.getSeedGroup('R1-seed')

        self.assertEqual(expected_group, group)

    def testProjectRegions(self):
        jsonIO = StringIO.StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1 and R2": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [1, 3],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": "R2",
          "coordinate_region_length": 1,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  }
}
""")
        expected_project_regions = [{"project_name": "R1",
                                     "coordinate_region_length": 3,
                                     "key_positions": [],
                                     "min_coverage1": 10,
                                     "min_coverage2": 50,
                                     "min_coverage3": 100},
                                    {"project_name": "R1 and R2",
                                     "coordinate_region_length": 3,
                                     "key_positions": [1, 3],
                                     "min_coverage1": 10,
                                     "min_coverage2": 50,
                                     "min_coverage3": 100}]
        self.config.load(jsonIO)

        project_regions = list(self.config.getProjectRegions('R1-seed', 'R1'))

        self.assertEqual(expected_project_regions, project_regions)
def main():
    args = parse_args()
    project_config = ProjectConfig.loadDefault()
    scoring_path = Path(__file__).parent.parent / 'project_scoring.json'
    with scoring_path.open() as scoring_file:
        scoring_config = json.load(scoring_file)
    with qai_helper.Session() as session:
        session.login(args.qai_server, args.qai_user, args.qai_password)

        pipelines = session.get_json(
            "/lab_miseq_pipelines?version=" + args.pipeline_version,
            retries=0)
        if pipelines:
            raise RuntimeError('Pipeline {} already exists.'.format(
                args.pipeline_version))

        seed_groups = session.get_json("/lab_miseq_seed_groups")
        # noinspection PyTypeChecker
        seed_group_ids = dict(map(itemgetter('name', 'id'), seed_groups))
        old_regions = session.get_json("/lab_miseq_regions", retries=0)
        regions = dict(((region['name'], region) for region in old_regions))
        for region_name, region_data in project_config.config['regions'].items():
            ref_seq = ''.join(region_data['reference'])
            region = regions.get(region_name)
            if region is None:
                seed_group_name = region_data['seed_group']
                seed_group_id = seed_group_ids.get(seed_group_name)
                if seed_group_id is None and seed_group_name:
                    seed_group = session.post_json("/lab_miseq_seed_groups",
                                                   {'name': seed_group_name})
                    seed_group_id = seed_group['id']
                    seed_group_ids[seed_group_name] = seed_group_id
                region = session.post_json(
                    "/lab_miseq_regions",
                    {'name': region_name,
                     'is_nucleotide': region_data['is_nucleotide'],
                     'reference': ref_seq,
                     'seed_group_id': seed_group_id})
                regions[region_name] = region
            elif region['reference'] != ref_seq:
                print("Reference doesn't match:", region_name)
                if args.update_sequences:
                    region['reference'] = ref_seq
                    session.post_json(f"/lab_miseq_regions/{region['id']}",
                                      region)

        pipeline = session.post_json("/lab_miseq_pipelines",
                                     {'version': args.pipeline_version})
        pipeline_id = pipeline['id']

        old_projects = session.get_json("/lab_miseq_projects", retries=0)
        projects = dict(((project['name'], project)
                         for project in old_projects))
        for project_name, project_data in project_config.config['projects'].items():
            project = projects.get(project_name)
            if project is None:
                project = session.post_json(
                    "/lab_miseq_projects",
                    {'name': project_name,
                     'max_variants': project_data['max_variants']})
            project_version = session.post_json(
                "/lab_miseq_project_versions",
                {'pipeline_id': pipeline_id,
                 'project_id': project['id']})

            for i, region_data in enumerate(project_data['regions']):
                scoring_data = scoring_config['projects'][project_name]['regions'][i]
                coordinate_region = regions[region_data['coordinate_region']]
                seed_region = regions[region_data['seed_region_names'][0]]
                seed_group_id = seed_region['seed_group_id']
                project_region = session.post_json(
                    "/lab_miseq_project_regions",
                    {'project_version_id': project_version['id'],
                     'coordinate_region_id': coordinate_region['id'],
                     'min_coverage1': scoring_data['min_coverage1'],
                     'min_coverage2': scoring_data['min_coverage2'],
                     'min_coverage3': scoring_data['min_coverage3'],
                     'seed_group_id': seed_group_id})

                for key_position in scoring_data['key_positions']:
                    session.post_json(
                        "/lab_miseq_key_positions",
                        {'project_region_id': project_region['id'],
                         'start_pos': key_position['start_pos'],
                         'end_pos': key_position['end_pos']})

    print("Done.")
import sys
from csv import DictReader  # needed by load_coverage() below
from pathlib import Path  # needed by ROOT below

from micall.utils.alignment_wrapper import align_nucs
try:
    # noinspection PyPackageRequirements
    from mappy import Aligner
except ImportError:
    Aligner = None
from micall.utils.fetch_sequences import fetch_by_accession
from micall.core.project_config import ProjectConfig

REFERENCE = ProjectConfig.loadDefault().getReference('SARS-CoV-2-seed')


def load_coverage(csv):
    result = {}
    with open(csv) as csvfile:
        reader = DictReader(csvfile)
        for row in reader:
            result[int(row['query_nuc_pos'])] = int(row['coverage'])
    return result


BATCH = 'batch_01'
ROOT = (
    Path('/wow') / BATCH
class CoveragePlotsTest(TestCase):
    def setUp(self):
        self.addTypeEqualityFunc(str, self.assertMultiLineEqual)
        config_json = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1-and-R2": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [
            {"end_pos": null, "start_pos": 1},
            {"end_pos": null, "start_pos": 3}
          ],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": "R2",
          "coordinate_region_length": 1,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  }
}
""")
        self.config = ProjectConfig()
        self.config.load(config_json)

    @patch('matplotlib.pyplot.savefig')
    @patch('micall.core.project_config.ProjectConfig.loadScoring')
    def test_simple(self, config_mock, savefig_mock):
        config_mock.return_value = self.config
        amino_csv = StringIO("""\
seed,region,q-cutoff,query.aa.pos,refseq.aa.pos,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*
R1-seed,R1,15,100,1,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
R1-seed,R1,15,101,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0
R1-seed,R1,15,102,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0
""")
        expected_scores = """\
project,region,seed,q.cut,min.coverage,which.key.pos,off.score,on.score
R1,R1,R1-seed,15,5,1,-1,1
R1-and-R2,R1,R1-seed,15,5,1,-1,1
"""
        scores_csv = StringIO()
        amino_csv.name = 'E1234.amino.csv'
        expected_calls = [call('E1234.R1.R1.png'),
                          call('E1234.R1-and-R2.R1.png')]

        coverage_plot(amino_csv, coverage_scores_csv=scores_csv)

        self.assertEqual(expected_calls, savefig_mock.mock_calls)
        self.assertEqual(expected_scores, scores_csv.getvalue())
def main():
    fastq_files = [FastqFile('2130A-HCV_S15_L001_R1_001.fastq',
                             '2130',
                             False,
                             (FastqSection('HCV2-JFH-1-NS5b', 1, 60, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 117, 176, 100)),
                             (CodonMutation(159, 'GTC'),)),
                   FastqFile('2130A-HCV_S15_L001_R2_001.fastq',
                             '2130',
                             True,
                             (FastqSection('HCV2-JFH-1-NS5b', 57, 116, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 171, 230, 100)),
                             (CodonMutation(159, 'GTC'),)),
                   FastqFile('2130AMIDI-MidHCV_S16_L001_R1_001.fastq',
                             '2130',
                             False,
                             (FastqSection('HCV2-JFH-1-NS5b', 231, 313, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 396, 478, 100)),
                             (CodonMutation(316, 'AGC'),)),
                   FastqFile('2130AMIDI-MidHCV_S16_L001_R2_001.fastq',
                             '2130',
                             True,
                             (FastqSection('HCV2-JFH-1-NS5b', 313, 395, 100),
                              FastqSection('HCV2-JFH-1-NS5b', 479, 561, 100)),
                             (CodonMutation(316, 'AGC'),)),
                   FastqFile('2140A-HIV_S17_L001_R1_001.fastq',
                             '2140',
                             False,
                             (FastqSection('PR', 1, 80, 100),),
                             (CodonMutation(24, 'ATA'),)),
                   FastqFile('2140A-HIV_S17_L001_R2_001.fastq',
                             '2140',
                             True,
                             (FastqSection('PR', 20, 99, 100),),
                             (CodonMutation(24, 'ATA'),))]
    projects = ProjectConfig.loadDefault()
    for fastq_file in fastq_files:
        with open(fastq_file.name, 'w') as f:
            next_cluster = 1
            for section in fastq_file.sections:
                ref_name, ref_start, ref_end = find_coord_pos(
                    projects,
                    section.coord_name,
                    section.start_pos,
                    section.end_pos)
                ref_nuc_seq = projects.getReference(ref_name)
                ref_nuc_section = list(ref_nuc_seq[ref_start:ref_end])
                for mutation in fastq_file.mutations:
                    if section.start_pos <= mutation.pos <= section.end_pos:
                        section_pos = (mutation.pos - section.start_pos) * 3
                        ref_nuc_section[section_pos:section_pos+3] = list(
                            mutation.codon)
                ref_nuc_section = ''.join(ref_nuc_section)
                if fastq_file.is_reversed:
                    ref_nuc_section = reverse_and_complement(ref_nuc_section)
                phred_scores = 'A' * (ref_end-ref_start)
                file_num = '2' if fastq_file.is_reversed else '1'
                for cluster in range(section.count):
                    f.write('@M01234:01:000000000-AAAAA:1:1101:{}:{:04} {}:N:0:1\n'.format(
                        fastq_file.extract_num,
                        cluster + next_cluster,
                        file_num))
                    f.write(ref_nuc_section+'\n')
                    f.write('+\n')
                    f.write(phred_scores+'\n')
                next_cluster += section.count
def aln2counts(aligned_csv,
               nuc_csv,
               amino_csv,
               coord_ins_csv,
               conseq_csv,
               failed_align_csv,
               callback=None,
               coverage_summary_csv=None,
               clipping_csv=None,
               conseq_ins_csv=None,
               g2p_aligned_csv=None,
               remap_conseq_csv=None):
    """ Analyze aligned reads for nucleotide and amino acid frequencies.

    Generate consensus sequences.

    @param aligned_csv: Open file handle containing aligned reads
        (from sam2aln)
    @param nuc_csv: Open file handle to write nucleotide frequencies.
    @param amino_csv: Open file handle to write amino acid frequencies.
    @param coord_ins_csv: Open file handle to write insertions relative to
        coordinate reference.
    @param conseq_csv: Open file handle to write consensus sequences.
    @param failed_align_csv: Open file handle to write sample consensus
        sequences that failed to align to the coordinate reference.
    @param callback: a function to report progress with three optional
        parameters - callback(message, progress, max_progress)
    @param coverage_summary_csv: Open file handle to write coverage depth.
    @param clipping_csv: Open file handle containing soft clipping counts
    @param conseq_ins_csv: Open file handle containing insertions relative
        to consensus sequence
    @param g2p_aligned_csv: Open file handle containing aligned reads
        (from fastq_g2p)
    @param remap_conseq_csv: Open file handle containing consensus sequences
        from the remap step.
    """
    # load project information
    projects = ProjectConfig.loadDefault()

    # initialize reporter classes
    insert_writer = InsertionWriter(coord_ins_csv)
    report = SequenceReport(insert_writer, projects, CONSEQ_MIXTURE_CUTOFFS)
    report.consensus_min_coverage = CONSENSUS_MIN_COVERAGE
    report.write_amino_header(amino_csv)
    report.write_consensus_header(conseq_csv)
    report.write_failure_header(failed_align_csv)
    report.write_nuc_header(nuc_csv)
    if coverage_summary_csv is None:
        coverage_summary = coverage_writer = None
    else:
        coverage_writer = csv.DictWriter(coverage_summary_csv,
                                         ['avg_coverage',
                                          'coverage_region',
                                          'region_width'],
                                         lineterminator=os.linesep)
        coverage_writer.writeheader()
        coverage_summary = {}

    if callback:
        aligned_filename = getattr(aligned_csv, 'name', None)
        if aligned_filename:
            file_size = os.stat(aligned_filename).st_size
            report.enable_callback(callback, file_size)

    if clipping_csv is not None:
        report.read_clipping(clipping_csv)
    if conseq_ins_csv is not None:
        report.read_insertions(conseq_ins_csv)
    if remap_conseq_csv is not None:
        report.read_remap_conseqs(remap_conseq_csv)

    report.process_reads(g2p_aligned_csv, aligned_csv, coverage_summary)

    if coverage_summary_csv is not None:
        if coverage_summary:
            coverage_writer.writerow(coverage_summary)
def build_conseqs(conseqs_file, run, sample_sheet, ok_sample_regions):
    """ Parses a Pipeline-produced conseq file and builds JSON objects to
    send to QAI.

    @param conseqs_file: An open file that contains the consensus sequences
        from the counts2csf step for all samples in the run.
    @param run: a hash with the attributes of the run record, including a
        sequencing summary of all the samples and their target projects
    @param sample_sheet: The data parsed from the sample sheet.
    @param ok_sample_regions: A set of (sample_name, region, qcut) tuples
        that were given a good score by the pipeline.
    @return an array of JSON hashes, one for each conseq.
    """
    result = []
    ss = sample_sheet
    sequencings = run['sequencing_summary']

    conseqs_csv = csv.DictReader(conseqs_file)
    # ss["Data"] is keyed by (what should be) the FASTQ
    # filename, which looks like
    #
    # [sample name with ; and _ replaced by -]_S[sample number].
    #
    # Meanwhile, entries in conseqs_file have a "sample" field holding
    # just the sample name (also with ; and _ replaced). We make a
    # lookup table to get the FASTQ filename just from the first part.
    # This will make subsequent steps easier (avoids having to do a
    # search through a list/dict of dicts).
    # FASTQ_lookup = {}
    # filename_re = re.compile("(.+)_S.+")
    # for fastq_filename in ss["Data"]:
    #     sample_name = filename_re.match(fastq_filename).group(1)
    #     FASTQ_lookup[sample_name] = fastq_filename

    projects = ProjectConfig.loadDefault()
    target_regions = set()  # set([(tags, seed_name)])
    for entry in sequencings:
        try:
            seeds = projects.getProjectSeeds(entry['target_project'])
        except KeyError:
            logger.warning('Failed to load project seeds.', exc_info=True)
            seeds = set()
        for seed in seeds:
            target_regions.add((entry['tag'], seed))

    for row in conseqs_csv:
        # Each row of this file looks like:
        # sample,region,q-cutoff,s-number,consensus-percent-cutoff,sequence
        # We want to take the "sample" entry and get the corresponding
        # original Sample_Name from the sample sheet. In version 2, this
        # looks like [sample name]~[project name]#[...]
        # In version 1, this looked like [sample name]~[project name]#[...]
        # but both ; and _ got garbled by the MiSeq instrument itself.
        # Thus we have to work around it.
        fastq_filename = row["sample"]
        sample_info = ss["Data"][fastq_filename]
        orig_sample_name = sample_info["orig_sample_name"]
        sample_tags = sample_info["tags"]
        # FIXME if row["sequence"] is blank we replace it with a dash.
        # Need Conan to make that row blank-able.
        curr_seq = row["sequence"] if len(row["sequence"]) > 0 else "-"
        sample_region = (fastq_filename, row["region"], row["q-cutoff"])
        ok_region = sample_region in ok_sample_regions
        is_target_region = (sample_tags, row["region"]) in target_regions
        ok_for_release = ok_region and is_target_region
        result.append({"samplename": orig_sample_name,
                       # July 9, 2014: we can't do this properly right now
                       # without a lookup table that is yet to be fully
                       # defined.
                       "testcode": None,
                       "conseq_cutoff": row["consensus-percent-cutoff"],
                       "region": row["region"],
                       "qcutoff": float(row["q-cutoff"]),
                       "snum": fastq_filename.split('_')[-1],
                       "seq": curr_seq,
                       "ok_for_release": ok_for_release})
    return result
def main():
    project_config = ProjectConfig.loadDefault()
    with open('../project_scoring.json', 'rU') as scoring_file:
        scoring_config = json.load(scoring_file)
    with qai_helper.Session() as session:
        session.login(settings.qai_path,
                      settings.qai_user,
                      settings.qai_password)

        pipelines = session.get_json(
            "/lab_miseq_pipelines?version=" + settings.pipeline_version,
            retries=0)
        if pipelines:
            raise RuntimeError('Pipeline {} already exists.'.format(
                settings.pipeline_version))

        seed_groups = session.get_json("/lab_miseq_seed_groups")
        seed_group_ids = dict(map(itemgetter('name', 'id'), seed_groups))
        old_regions = session.get_json("/lab_miseq_regions", retries=0)
        regions = dict(((region['name'], region) for region in old_regions))
        for region_name, region_data in project_config.config['regions'].iteritems():
            region = regions.get(region_name)
            if region is None:
                seed_group_name = region_data['seed_group']
                seed_group_id = seed_group_ids.get(seed_group_name)
                if seed_group_id is None and seed_group_name:
                    seed_group = session.post_json("/lab_miseq_seed_groups",
                                                   {'name': seed_group_name})
                    seed_group_id = seed_group['id']
                    seed_group_ids[seed_group_name] = seed_group_id
                region = session.post_json(
                    "/lab_miseq_regions",
                    {'name': region_name,
                     'is_nucleotide': region_data['is_nucleotide'],
                     'reference': ''.join(region_data['reference']),
                     'seed_group_id': seed_group_id})
                regions[region_name] = region

        pipeline = session.post_json("/lab_miseq_pipelines",
                                     {'version': settings.pipeline_version})
        pipeline_id = pipeline['id']

        old_projects = session.get_json("/lab_miseq_projects", retries=0)
        projects = dict(((project['name'], project)
                         for project in old_projects))
        for project_name, project_data in project_config.config['projects'].iteritems():
            project = projects.get(project_name)
            if project is None:
                project = session.post_json(
                    "/lab_miseq_projects",
                    {'name': project_name,
                     'max_variants': project_data['max_variants']})
            project_version = session.post_json(
                "/lab_miseq_project_versions",
                {'pipeline_id': pipeline_id,
                 'project_id': project['id']})

            for i, region_data in enumerate(project_data['regions']):
                scoring_data = scoring_config['projects'][project_name]['regions'][i]
                coordinate_region = regions[region_data['coordinate_region']]
                seed_region = regions[region_data['seed_region_names'][0]]
                seed_group_id = seed_region['seed_group_id']
                project_region = session.post_json(
                    "/lab_miseq_project_regions",
                    {'project_version_id': project_version['id'],
                     'coordinate_region_id': coordinate_region['id'],
                     'min_coverage1': scoring_data['min_coverage1'],
                     'min_coverage2': scoring_data['min_coverage2'],
                     'min_coverage3': scoring_data['min_coverage3'],
                     'seed_group_id': seed_group_id})

                for key_position in scoring_data['key_positions']:
                    session.post_json(
                        "/lab_miseq_key_positions",
                        {'project_region_id': project_region['id'],
                         'start_pos': key_position['start_pos'],
                         'end_pos': key_position['end_pos']})

    print "Done."
def main():
    projects = ProjectConfig.loadDefault()
    sections_2100hcv_1, sections_2100hcv_2 = make_random_sections(
        'HCV1A-H77-NS5a', 1, 300, projects, 400)
    sections_2100v3_1, sections_2100v3_2 = (
        [FastqSection('HIV1-B-FR-K03455-seed', 7056, 7312, 50),
         FastqSection('HIV1-B-FR-K03455-seed', 7062, 7312, 50)],
        [FastqSection('HIV1-B-FR-K03455-seed', 7123, 7373, 50),
         FastqSection('HIV1-B-FR-K03455-seed', 7123, 7376, 50)])
    sections_2100hiv_1, sections_2100hiv_2 = make_random_sections(
        'RT', 1, 300, projects, 400)
    sections_2160_1, sections_2160_2 = make_random_sections(
        'HCV2-JFH-1-NS5b', 1, 230, projects,
        mutations=(CodonMutation(159, 'GTC'),))
    sections_2160midi_1, sections_2160midi_2 = make_random_sections(
        'HCV2-JFH-1-NS5b', 231, 561, projects,
        mutations=(CodonMutation(316, 'AGC'),))
    sections_2170_1a_1, sections_2170_1a_2 = make_random_sections(
        'HCV-1a', 6258, 9375)
    sections_2170_2_1, sections_2170_2_2 = make_random_sections(
        'HCV-2a', 6269, 9440)
    sections_2180_1, sections_2180_2 = make_random_sections(
        'HIV1-B-FR-K03455-seed', 6225, 7757)
    hxb2_ref = projects.getReference('HIV1-B-FR-K03455-seed')
    projects.config['regions']['HXB2-with-deletion'] = dict(
        reference=hxb2_ref[617:928] + hxb2_ref[9358:9652],
        is_nucleotide=True,
        seed_group=None)
    sections_2210_1, sections_2210_2 = make_random_sections(
        'HXB2-with-deletion', projects=projects)
    fastq_files = [
        FastqFile('2010A-V3LOOP_S3_L001_R1_001.fastq',
                  '2010',
                  False,
                  (FastqSection('HIV1-CON-XX-Consensus-seed', 855, 906, 10),
                   FastqSection('HIV1-CON-XX-Consensus-seed', 912, 960, 10))),
        FastqFile('2010A-V3LOOP_S3_L001_R2_001.fastq',
                  '2010',
                  True,
                  (FastqSection('HIV1-CON-XX-Consensus-seed', 855, 906, 10),
                   FastqSection('HIV1-CON-XX-Consensus-seed', 912, 960, 10))),
        FastqFile('2020A-GP41_S4_L001_R1_001.fastq',
                  '2020',
                  False,
                  (FastqSection('HIV1-B-FR-KF716496-seed', 6957, 7065, 10,
                                (CodonMutation(6981, 'GGGATA'),)),)),
        FastqFile('2020A-GP41_S4_L001_R2_001.fastq',
                  '2020',
                  True,
                  (FastqSection('HIV1-B-FR-KF716496-seed', 6957, 7065, 10,
                                (CodonMutation(6981, 'GGGATA'),)),)),
        FastqFile('2040A-HLA-B_S6_L001_R1_001.fastq',
                  '2040',
                  False,
                  (FastqSection('HLA-B-seed', 201, 315, 80),
                   FastqSection('HLA-B-seed', 201, 315, 20,
                                (CodonMutation(207, 'TCT'),)))),
        FastqFile('2040A-HLA-B_S6_L001_R2_001.fastq',
                  '2040',
                  True,
                  (FastqSection('HLA-B-seed', 201, 315, 80),
                   FastqSection('HLA-B-seed', 201, 315, 20,
                                (CodonMutation(207, 'TCT'),)))),
        FastqFile('2070A-PR_S9_L001_R1_001.fastq',
                  '2070',
                  False,
                  (FastqSection('PR', 40, 80, 12, (CodonMutation(45, ''),)),
                   FastqSection('PR', 40, 80, 3, (CodonMutation(45, ''),
                                                  CodonMutation(64, ''))))),
        FastqFile('2070A-PR_S9_L001_R2_001.fastq',
                  '2070',
                  True,
                  (FastqSection('PR', 40, 80, 12, (CodonMutation(45, ''),)),
                   FastqSection('PR', 40, 80, 3, (CodonMutation(45, ''),
                                                  CodonMutation(64, ''))))),
        FastqFile('2100A-HCV-1337B-V3LOOP-PWND-HIV_S12_L001_R1_001.fastq',
                  '2100',
                  False,
                  sections_2100hcv_1 + sections_2100v3_1 + sections_2100hiv_1),
        FastqFile('2100A-HCV-1337B-V3LOOP-PWND-HIV_S12_L001_R2_001.fastq',
                  '2100',
                  True,
                  sections_2100hcv_2 + sections_2100v3_2 + sections_2100hiv_2),
        FastqFile('2130A-HCV_S15_L001_R1_001.fastq',
                  '2130',
                  False,
                  (FastqSection('HCV2-JFH-1-NS5b', 1, 66, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 115, 181, 100,
                                (CodonMutation(159, 'GTC'),)))),
        FastqFile('2130A-HCV_S15_L001_R2_001.fastq',
                  '2130',
                  True,
                  (FastqSection('HCV2-JFH-1-NS5b', 51, 114, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 165, 230, 100))),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R1_001.fastq',
                  '2130',
                  False,
                  (FastqSection('HCV2-JFH-1-NS5b', 231, 315, 100),
                   FastqSection('HCV2-JFH-1-NS5b', 398, 485, 100))),
        FastqFile('2130AMIDI-MidHCV_S16_L001_R2_001.fastq',
                  '2130',
                  True,
                  (FastqSection('HCV2-JFH-1-NS5b', 305, 397, 100,
                                (CodonMutation(316, 'AGC'),)),
                   FastqSection('HCV2-JFH-1-NS5b', 470, 561, 100))),
        FastqFile('2140A-HIV_S17_L001_R1_001.fastq',
                  '2140',
                  False,
                  (FastqSection('PR', 1, 80, 100,
                                (CodonMutation(24, 'ATA'),)),)),
        FastqFile('2140A-HIV_S17_L001_R2_001.fastq',
                  '2140',
                  True,
                  (FastqSection('PR', 20, 99, 100,
                                (CodonMutation(24, 'ATA'),)),)),
        # Simplify with one_contig.
        FastqFile('2160A-HCV_S19_L001_R1_001.fastq',
                  '2160',
                  False,
                  sections_2160_1),
        FastqFile('2160A-HCV_S19_L001_R2_001.fastq',
                  '2160',
                  True,
                  sections_2160_2),
        # Simplify with one_contig.
        FastqFile('2160AMIDI-MidHCV_S20_L001_R1_001.fastq',
                  '2160',
                  False,
                  sections_2160midi_1),
        FastqFile('2160AMIDI-MidHCV_S20_L001_R2_001.fastq',
                  '2160',
                  True,
                  sections_2160midi_2),
        # Simplify with two_long_contigs.
        FastqFile('2170A-HCV_S21_L001_R1_001.fastq',
                  '2170',
                  False,
                  sections_2170_1a_1 + sections_2170_2_1),
        FastqFile('2170A-HCV_S21_L001_R2_001.fastq',
                  '2170',
                  True,
                  sections_2170_1a_2 + sections_2170_2_2),
        FastqFile('2180A-HIV_S22_L001_R1_001.fastq',
                  '2180',
                  False,
                  sections_2180_1),
        FastqFile('2180A-HIV_S22_L001_R2_001.fastq',
                  '2180',
                  True,
                  sections_2180_2),
        FastqFile('2190A-SARSCOV2_S23_L001_R1_001.fastq',
                  '2190',
                  False,
                  (FastqSection('SARS-CoV-2-ORF1ab', 4393, 4429, 50,
                                (CodonMutation(4400, 'TCA'),)),
                   FastqSection('SARS-CoV-2-ORF1ab', 4393, 4430, 50,
                                (CodonMutation(4400, 'TCA'),)))),
        FastqFile('2190A-SARSCOV2_S23_L001_R2_001.fastq',
                  '2190',
                  True,
                  (FastqSection('SARS-CoV-2-ORF1ab', 4393, 4429, 50,
                                (CodonMutation(4400, 'TCA'),)),
                   FastqSection('SARS-CoV-2-ORF1ab', 4393, 4430, 50,
                                (CodonMutation(4400, 'TCA'),)))),
        FastqFile('2200A-SARSCOV2_S24_L001_R1_001.fastq',
                  '2200',
                  False,
                  (FastqSection('SARS-CoV-2-nsp1', 20, 66, 100),)),
        FastqFile('2200A-SARSCOV2_S24_L001_R2_001.fastq',
                  '2200',
                  True,
                  (FastqSection('SARS-CoV-2-nsp1', 56, 102, 100),)),
        FastqFile('2210A-NFLHIVDNA_S25_L001_R1_001.fastq',
                  '2210',
                  False,
                  sections_2210_1),
        FastqFile('2210A-NFLHIVDNA_S25_L001_R2_001.fastq',
                  '2210',
                  True,
                  sections_2210_2)]
    for fastq_file in fastq_files:
        with open(fastq_file.name, 'w') as f:
            next_cluster = 1
            for section in fastq_file.sections:
                ref_name, ref_start, ref_end = find_coord_pos(
                    projects,
                    section.coord_name,
                    section.start_pos,
                    section.end_pos)
                ref_nuc_seq = projects.getReference(ref_name)
                ref_nuc_section = list(ref_nuc_seq[ref_start:ref_end])
                is_nucleotide = ((ref_start, ref_end) ==
                                 (section.start_pos, section.end_pos))
                for mutation in section.mutations:
                    if section.start_pos <= mutation.pos <= section.end_pos:
                        section_pos = mutation.pos - section.start_pos
                        if not is_nucleotide:
                            section_pos *= 3
                        ref_nuc_section[section_pos:section_pos + 3] = list(
                            mutation.codon)
                ref_nuc_section = ''.join(ref_nuc_section)
                if fastq_file.is_reversed:
                    ref_nuc_section = reverse_and_complement(ref_nuc_section)
                phred_scores = 'A' * len(ref_nuc_section)
                file_num = '2' if fastq_file.is_reversed else '1'
                # noinspection PyTypeChecker
                for cluster in range(section.count):
                    f.write('@M01234:01:000000000-AAAAA:1:1101:{}:{:04} '
                            '{}:N:0:1\n'.format(fastq_file.extract_num,
                                                cluster + next_cluster,
                                                file_num))
                    f.write(ref_nuc_section + '\n')
                    f.write('+\n')
                    f.write(phred_scores + '\n')
                next_cluster += section.count
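# For orientation: reverse_and_complement is imported from elsewhere in the
# pipeline. A minimal sketch of the standard operation it presumably
# performs, assuming plain ACGT/acgt input (the real helper may handle
# ambiguity codes as well):
#
#     def reverse_and_complement(seq: str) -> str:
#         """Return the reverse complement of a nucleotide sequence."""
#         complement = str.maketrans('ACGTacgt', 'TGCAtgca')
#         return seq.translate(complement)[::-1]
#
# e.g. reverse_and_complement('AACGT') == 'ACGTT'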
class ConvertPrelimTest(unittest.TestCase):
    def setUp(self):
        self.projects = ProjectConfig()
        self.projects.load(StringIO("""\
{
  "regions": {
    "R1-seed": {
      "seed_group": "main",
      "reference": ["ACTAAAGGG"]
    },
    "R2-seed": {
      "seed_group": "main",
      "reference": ["ACTAAAGGGAAA"]
    }
  }
}
"""))
        self.sam_file = StringIO()
        self.remap_counts = StringIO()
        self.remap_counts_writer = DictWriter(
            self.remap_counts,
            ['type', 'filtered_count', 'count'],
            lineterminator=os.linesep)
        self.remap_counts_writer.writeheader()

    def test_simple(self):
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,9M,=,1,0,AAACCCTTT,BBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD\tVN:1.0\tSO:unsorted
@SQ\tSN:R1-seed\tLN:9
@SQ\tSN:R2-seed\tLN:12
@PG\tID:bowtie2\tPN:bowtie2\tVN:2.2.3\tCL:""
example1\t89\tR1-seed\t1\t0\t9M\t=\t1\t0\tAAACCCTTT\tBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim R1-seed,0,1
"""
        expected_seed_counts = {}

        seed_counts = convert_prelim(prelim_csv,
                                     self.sam_file,
                                     self.remap_counts_writer,
                                     count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)

    def test_two_regions(self):
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,9M,=,1,0,AAACCCTTT,BBBBBBBBB
example2,89,R2-seed,1,0,9M,=,1,0,AAAACCTTT,BBBBBBBBB
example3,89,R2-seed,1,0,9M,=,1,0,AAAAACTTT,BBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD\tVN:1.0\tSO:unsorted
@SQ\tSN:R1-seed\tLN:9
@SQ\tSN:R2-seed\tLN:12
@PG\tID:bowtie2\tPN:bowtie2\tVN:2.2.3\tCL:""
example1\t89\tR1-seed\t1\t0\t9M\t=\t1\t0\tAAACCCTTT\tBBBBBBBBB
example2\t89\tR2-seed\t1\t0\t9M\t=\t1\t0\tAAAACCTTT\tBBBBBBBBB
example3\t89\tR2-seed\t1\t0\t9M\t=\t1\t0\tAAAAACTTT\tBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim R1-seed,0,1
prelim R2-seed,0,2
"""
        expected_seed_counts = {}

        seed_counts = convert_prelim(prelim_csv,
                                     self.sam_file,
                                     self.remap_counts_writer,
                                     count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)

    def test_long_reads(self):
        self.maxDiff = None
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,54M,=,1,0,\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2,89,R1-seed,1,0,54M,=,1,0,\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD\tVN:1.0\tSO:unsorted
@SQ\tSN:R1-seed\tLN:9
@SQ\tSN:R2-seed\tLN:12
@PG\tID:bowtie2\tPN:bowtie2\tVN:2.2.3\tCL:""
example1\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim R1-seed,2,2
"""
        expected_seed_counts = {'R1-seed': 2}

        seed_counts = convert_prelim(prelim_csv,
                                     self.sam_file,
                                     self.remap_counts_writer,
                                     count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)

    def test_star_region(self):
        self.maxDiff = None
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,54M,=,1,0,\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2,89,R1-seed,1,0,54M,=,1,0,\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example3,93,*,*,*,*,*,*,*,\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD\tVN:1.0\tSO:unsorted
@SQ\tSN:R1-seed\tLN:9
@SQ\tSN:R2-seed\tLN:12
@PG\tID:bowtie2\tPN:bowtie2\tVN:2.2.3\tCL:""
example1\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example3\t93\t*\t*\t*\t*\t*\t*\t*\t\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim *,0,1
prelim R1-seed,2,2
"""
        expected_seed_counts = {'R1-seed': 2}

        seed_counts = convert_prelim(prelim_csv,
                                     self.sam_file,
                                     self.remap_counts_writer,
                                     count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)

    def test_best_in_group(self):
        self.maxDiff = None
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,54M,=,1,0,\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2,89,R2-seed,1,0,54M,=,1,0,\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example3,89,R1-seed,1,0,54M,=,1,0,\
AAAAAATTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example4,89,R2-seed,1,0,54M,=,1,0,\
AAAAAAAATAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example5,89,R2-seed,1,0,54M,=,1,0,\
AAAAAAAAAAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD\tVN:1.0\tSO:unsorted
@SQ\tSN:R1-seed\tLN:9
@SQ\tSN:R2-seed\tLN:12
@PG\tID:bowtie2\tPN:bowtie2\tVN:2.2.3\tCL:""
example1\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2\t89\tR2-seed\t1\t0\t54M\t=\t1\t0\t\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example3\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAAAAATTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example4\t89\tR2-seed\t1\t0\t54M\t=\t1\t0\t\
AAAAAAAATAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example5\t89\tR2-seed\t1\t0\t54M\t=\t1\t0\t\
AAAAAAAAAAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim R1-seed,2,2
prelim R2-seed,3,3
"""
        expected_seed_counts = {'R2-seed': 3}

        seed_counts = convert_prelim(prelim_csv,
                                     self.sam_file,
                                     self.remap_counts_writer,
                                     count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)

    def test_unmapped_read(self):
        self.maxDiff = None
        prelim_csv = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
example1,89,R1-seed,1,0,54M,=,1,0,\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2,93,R1-seed,1,0,54M,=,1,0,\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT,\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
""")
        count_threshold = 2
        expected_sam_file = """\
@HD\tVN:1.0\tSO:unsorted
@SQ\tSN:R1-seed\tLN:9
@SQ\tSN:R2-seed\tLN:12
@PG\tID:bowtie2\tPN:bowtie2\tVN:2.2.3\tCL:""
example1\t89\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTTAAACCCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
example2\t93\tR1-seed\t1\t0\t54M\t=\t1\t0\t\
AAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTTAAAACCTTT\t\
BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
"""
        expected_remap_counts = """\
type,filtered_count,count
prelim R1-seed,1,2
"""
        expected_seed_counts = {}

        seed_counts = convert_prelim(prelim_csv,
                                     self.sam_file,
                                     self.remap_counts_writer,
                                     count_threshold,
                                     self.projects)

        self.assertEqual(expected_sam_file, self.sam_file.getvalue())
        self.assertEqual(expected_remap_counts, self.remap_counts.getvalue())
        self.assertEqual(expected_seed_counts, seed_counts)
def build_conseqs(conseqs_file, run, sample_sheet, ok_sample_regions):
    """ Parses a Pipeline-produced conseq file and builds JSON objects to send
    to QAI.

    @param conseqs_file: An open file that contains the consensus sequences
        from the counts2csf step for all samples in the run.
    @param run: a hash with the attributes of the run record, including a
        sequencing summary of all the samples and their target projects
    @param sample_sheet: The data parsed from the sample sheet.
    @param ok_sample_regions: A set of (sample_name, region, qcut) tuples that
        were given a good score by the pipeline.
    @return an array of JSON hashes, one for each conseq.
    """

    result = []
    ss = sample_sheet
    sequencings = run['sequencing_summary']
    conseqs_csv = csv.DictReader(conseqs_file)
    # ss["Data"] is keyed by (what should be) the FASTQ
    # filename, which looks like
    #
    # [sample name with ; and _ replaced by -]_S[sample number].
    #
    # Meanwhile, entries in conseqs_file have a "sample" field holding
    # just the sample name (also with ; and _ replaced). We make a
    # lookup table to get the FASTQ filename just from the first part.
    # This will make subsequent steps easier (avoids having to do a
    # search through a list/dict of dicts).
    # FASTQ_lookup = {}
    # filename_re = re.compile("(.+)_S.+")
    # for fastq_filename in ss["Data"]:
    #     sample_name = filename_re.match(fastq_filename).group(1)
    #     FASTQ_lookup[sample_name] = fastq_filename

    projects = ProjectConfig.loadDefault()
    target_regions = set()  # set([(tags, seed_name)])
    for entry in sequencings:
        seeds = projects.getProjectSeeds(entry['target_project'])
        for seed in seeds:
            target_regions.add((entry['tag'], seed))

    for row in conseqs_csv:
        # Each row of this file looks like:
        # sample,region,q-cutoff,s-number,consensus-percent-cutoff,sequence
        # We want to take the "sample" entry and get the corresponding
        # original Sample_Name from the sample sheet. In version 2, this
        # looks like [sample name]~[project name]#[...]
        # In version 1, this looked like [sample name]~[project name]#[...]
        # but both ; and _ got garbled by the MiSeq instrument itself.
        # Thus we have to work around it.
        fastq_filename = row["sample"]
        sample_info = ss["Data"][fastq_filename]
        orig_sample_name = sample_info["orig_sample_name"]
        sample_tags = sample_info["tags"]
        # FIXME if row["sequence"] is blank we replace it with a dash.
        # Need Conan to make that row blank-able.
        curr_seq = row["sequence"] if len(row["sequence"]) > 0 else "-"
        sample_region = (fastq_filename, row["region"], row["q-cutoff"])
        ok_region = sample_region in ok_sample_regions
        is_target_region = (sample_tags, row["region"]) in target_regions
        ok_for_release = ok_region and is_target_region
        result.append({
            "samplename": orig_sample_name,
            # July 9, 2014: we can't do this properly right now
            # without a lookup table that is yet to be fully
            # defined.
            "testcode": None,
            "conseq_cutoff": row["consensus-percent-cutoff"],
            "region": row["region"],
            "qcutoff": float(row["q-cutoff"]),
            "snum": fastq_filename.split('_')[-1],
            "seq": curr_seq,
            "ok_for_release": ok_for_release
        })
    return result
class CoveragePlotsTest(TestCase):
    def setUp(self):
        self.addTypeEqualityFunc(str, self.assertMultiLineEqual)
        config_json = StringIO("""\
{
  "projects": {
    "R1": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R1-seed"]
        }
      ]
    },
    "R1-and-R2": {
      "max_variants": 0,
      "regions": [
        {
          "coordinate_region": "R1",
          "coordinate_region_length": 3,
          "key_positions": [
            {"end_pos": null, "start_pos": 1},
            {"end_pos": null, "start_pos": 3}
          ],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R1-seed"]
        },
        {
          "coordinate_region": "R2",
          "coordinate_region_length": 1,
          "key_positions": [],
          "min_coverage1": 10,
          "min_coverage2": 50,
          "min_coverage3": 100,
          "seed_region_names": ["R2-seed"]
        }
      ]
    }
  }
}
""")
        self.config = ProjectConfig()
        self.config.load(config_json)

    @patch('matplotlib.pyplot.savefig')
    @patch('micall.core.project_config.ProjectConfig.loadScoring')
    def test_simple(self, config_mock, savefig_mock):
        config_mock.return_value = self.config
        amino_csv = StringIO("""\
seed,region,q-cutoff,query.aa.pos,refseq.aa.pos,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*
R1-seed,R1,15,100,1,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
R1-seed,R1,15,101,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0
R1-seed,R1,15,102,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0
""")
        expected_scores = """\
project,region,seed,q.cut,min.coverage,which.key.pos,off.score,on.score
R1,R1,R1-seed,15,5,1,-1,1
R1-and-R2,R1,R1-seed,15,5,1,-1,1
"""
        scores_csv = StringIO()
        amino_csv.name = 'E1234.amino.csv'
        expected_calls = [call('E1234.R1.R1.png'),
                          call('E1234.R1-and-R2.R1.png')]

        coverage_plot(amino_csv, coverage_scores_csv=scores_csv)

        self.assertEqual(expected_calls, savefig_mock.mock_calls)
        self.assertEqual(expected_scores, scores_csv.getvalue())
def main():
    args = parse_args()
    projects = ProjectConfig.loadDefault()
    for sample_name in args.sample:
        process_file(sample_name, projects, args)
    print('Done.')
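# parse_args is defined elsewhere in this script; judging only from the loop
# above, it must at least supply a list of sample names. A hedged argparse
# sketch (any argument beyond `sample` would be an assumption):
#
#     import argparse
#
#     def parse_args():
#         parser = argparse.ArgumentParser(
#             description='Process sample files against project references.')
#         parser.add_argument('sample',
#                             nargs='+',
#                             help='sample file names to process')
#         return parser.parse_args()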
def main():
    args = parse_args()
    project_config = ProjectConfig.loadDefault()
    with open('../project_scoring.json', 'r') as scoring_file:
        scoring_config = json.load(scoring_file)
    with qai_helper.Session() as session:
        session.login(args.qai_server,
                      args.qai_user,
                      args.qai_password)
        pipelines = session.get_json(
            "/lab_miseq_pipelines?version=" + args.pipeline_version,
            retries=0)
        if pipelines:
            raise RuntimeError('Pipeline {} already exists.'.format(
                args.pipeline_version))

        seed_groups = session.get_json("/lab_miseq_seed_groups")
        seed_group_ids = dict(map(itemgetter('name', 'id'), seed_groups))
        old_regions = session.get_json("/lab_miseq_regions", retries=0)
        regions = {region['name']: region for region in old_regions}
        for region_name, region_data in project_config.config['regions'].items():
            region = regions.get(region_name)
            if region is None:
                seed_group_name = region_data['seed_group']
                seed_group_id = seed_group_ids.get(seed_group_name)
                if seed_group_id is None and seed_group_name:
                    seed_group = session.post_json("/lab_miseq_seed_groups",
                                                   {'name': seed_group_name})
                    seed_group_id = seed_group['id']
                    seed_group_ids[seed_group_name] = seed_group_id
                region = session.post_json(
                    "/lab_miseq_regions",
                    {'name': region_name,
                     'is_nucleotide': region_data['is_nucleotide'],
                     'reference': ''.join(region_data['reference']),
                     'seed_group_id': seed_group_id})
                regions[region_name] = region

        pipeline = session.post_json("/lab_miseq_pipelines",
                                     {'version': args.pipeline_version})
        pipeline_id = pipeline['id']

        old_projects = session.get_json("/lab_miseq_projects", retries=0)
        projects = {project['name']: project for project in old_projects}
        for project_name, project_data in project_config.config['projects'].items():
            project = projects.get(project_name)
            if project is None:
                project = session.post_json(
                    "/lab_miseq_projects",
                    {'name': project_name,
                     'max_variants': project_data['max_variants']})
            project_version = session.post_json(
                "/lab_miseq_project_versions",
                {'pipeline_id': pipeline_id,
                 'project_id': project['id']})
            for i, region_data in enumerate(project_data['regions']):
                scoring_data = scoring_config['projects'][project_name]['regions'][i]
                coordinate_region = regions[region_data['coordinate_region']]
                seed_region = regions[region_data['seed_region_names'][0]]
                seed_group_id = seed_region['seed_group_id']
                project_region = session.post_json(
                    "/lab_miseq_project_regions",
                    {'project_version_id': project_version['id'],
                     'coordinate_region_id': coordinate_region['id'],
                     'min_coverage1': scoring_data['min_coverage1'],
                     'min_coverage2': scoring_data['min_coverage2'],
                     'min_coverage3': scoring_data['min_coverage3'],
                     'seed_group_id': seed_group_id})
                for key_position in scoring_data['key_positions']:
                    session.post_json(
                        "/lab_miseq_key_positions",
                        {'project_region_id': project_region['id'],
                         'start_pos': key_position['start_pos'],
                         'end_pos': key_position['end_pos']})

    print("Done.")
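# qai_helper.Session is part of this code base; the sketch below is only a
# rough picture of the behaviour main() relies on, written with the
# `requests` library purely as an illustration. The real class may
# authenticate, retry, and handle errors quite differently.
#
#     import requests
#
#     class Session(requests.Session):
#         def login(self, base_url, user, password):
#             self.base_url = base_url
#             self.auth = (user, password)  # assumption: basic auth
#
#         def get_json(self, path, retries=3):
#             # retry handling omitted in this sketch
#             response = self.get(self.base_url + path)
#             response.raise_for_status()
#             return response.json()
#
#         def post_json(self, path, data):
#             response = self.post(self.base_url + path, json=data)
#             response.raise_for_status()
#             return response.json()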