def write_strains_with_HI_and_sequence(flutype='H3N2'):
    """Export the sequences and the strain list that have HI titers.

    Two files are written below ``data/``:

    * ``<flutype>_strains_with_HI.fasta`` -- the first record of
      ``<flutype>_gisaid_epiflu_sequence.fasta`` for every strain name
      (after ``HI_fix_name``) that occurs in the titer tables or in the
      strain set returned by ``read_trevor_table``.
    * ``<flutype>_HI_strains.txt`` -- one ``strain<TAB>count`` line per test
      strain, ordered by decreasing number of measurements.  When
      ``fix_name(strain)`` differs from the strain, a second line with the
      fixed name and the same count follows.
    """
    titer_tables = read_tables(flutype)
    trevor_data = read_trevor_table(flutype)
    strains_with_titers = set(titer_tables.index)
    strains_with_titers.update(trevor_data[0])
    from Bio import SeqIO

    fasta_out = "data/" + flutype + "_strains_with_HI.fasta"
    fasta_in = "data/" + flutype + "_gisaid_epiflu_sequence.fasta"
    already_written = set()
    with myopen(fasta_out, 'w') as outfile, myopen(fasta_in, 'r') as infile:
        for record in SeqIO.parse(infile, 'fasta'):
            # the strain name is the first '|'-separated field of the header
            name = HI_fix_name(record.description.split('|')[0].strip())
            if name not in strains_with_titers or name in already_written:
                continue
            SeqIO.write(record, outfile, 'fasta')
            already_written.add(name)

    # count measurements per test strain
    counts = defaultdict(int)
    for _, row in get_all_titers_flat(flutype).iterrows():
        test_strain, _ref, _src_id, _val = row
        counts[test_strain] += 1

    with myopen("data/" + flutype + "_HI_strains.txt", 'w') as strain_file:
        by_count = sorted(counts.items(), key=lambda item: item[1],
                          reverse=True)
        for strain, n_titers in by_count:
            strain_file.write(strain + '\t' + str(n_titers) + '\n')
            fixed = fix_name(strain)
            if fixed != strain:
                strain_file.write(fixed + '\t' + str(n_titers) + '\n')
def write_strains_with_HI_and_sequence(flutype='H3N2'):
    """Write the FASTA subset and strain list for strains with HI data.

    The set of strains with titers is the index of ``read_tables(flutype)``
    plus the first element returned by ``read_trevor_table(flutype)``.
    Sequences from ``data/<flutype>_gisaid_epiflu_sequence.fasta`` whose
    ``HI_fix_name``-reduced name is in that set are copied, once per name,
    to ``data/<flutype>_strains_with_HI.fasta``.  Afterwards the number of
    measurements per test strain is written to
    ``data/<flutype>_HI_strains.txt`` (most measured first), with an extra
    line for ``fix_name(strain)`` whenever that differs from the strain.
    """
    table_titers = read_tables(flutype)
    trevor = read_trevor_table(flutype)
    wanted = set(table_titers.index)
    wanted.update(trevor[0])
    from Bio import SeqIO
    seen = set()
    with myopen("data/" + flutype + "_strains_with_HI.fasta", 'w') as outfile, \
            myopen("data/" + flutype + "_gisaid_epiflu_sequence.fasta", 'r') as infile:
        for seq_rec in SeqIO.parse(infile, 'fasta'):
            header_name = seq_rec.description.split('|')[0].strip()
            key = HI_fix_name(header_name)
            # keep only the first sequence seen for each strain
            if key in wanted and key not in seen:
                SeqIO.write(seq_rec, outfile, 'fasta')
                seen.add(key)

    titer_count = defaultdict(int)
    measurements = get_all_titers_flat(flutype)
    for _idx, (test, _ref, _src_id, _val) in measurements.iterrows():
        titer_count[test] += 1

    with myopen("data/" + flutype + "_HI_strains.txt", 'w') as HI_strain_outfile:
        ranked = sorted(titer_count.items(), key=lambda pair: pair[1],
                        reverse=True)
        for strain, count in ranked:
            count_str = str(count)
            HI_strain_outfile.write(strain + '\t' + count_str + '\n')
            alias = fix_name(strain)
            if alias != strain:
                HI_strain_outfile.write(alias + '\t' + count_str + '\n')
def check_sources(self):
    """Validate the model separately against every measurement source.

    Reads ``self.HI_fname``; each non-blank line is split on whitespace into
    five fields (test, ref_virus, serum, src_id, value).  Non-NaN values are
    normalized with ``self.normalize`` and stored in
    ``self.source_HIs[src_id][test, (ref_virus, serum)]``.

    ``self.validate`` is then called once per source with that source's
    measurements as validation set.  On success
    ``self.source_validation[src_id]`` is set to
    ``[abs_error, rms_error, slope, intercept, n_checks]``; sources for
    which validation raises are reported and left out.
    """
    self.source_HIs = defaultdict(dict)
    with myopen(self.HI_fname, 'r') as infile:
        for line in infile:
            fields = line.strip().split()
            if not fields:
                continue  # tolerate blank lines
            test, ref_virus, serum, src_id, val_str = fields
            try:
                val = float(val_str)
                if not np.isnan(val):
                    self.source_HIs[src_id][test, (ref_virus, serum)] = \
                        self.normalize((ref_virus, serum), val)
            except Exception:
                # Report the raw text: `val` is unbound (or left over from
                # a previous line) when float() itself is what failed.
                print(test, ref_virus, serum, src_id, val_str)

    self.source_validation = {}
    for src_id in self.source_HIs:
        print('\n############### \n', src_id, '\n############### \n')
        print("number of measurements:", len(self.source_HIs[src_id]))
        try:
            n_checks = self.validate(validation_set=self.source_HIs[src_id],
                                     incl_ref_strains='no')
            self.source_validation[src_id] = [self.abs_error, self.rms_error,
                                              self.slope, self.intercept,
                                              n_checks]
        except Exception:
            # best effort: a failing source is skipped, but Ctrl-C and
            # SystemExit are no longer swallowed
            print("skipped due to too few measurements")
def write_flat_HI_titers(flutype='H3N2', fname=None):
    """Write the flat titer table, restricted to the known HI strains.

    Args:
        flutype: lineage prefix used to build the file names under ``data/``.
        fname: output path; defaults to ``data/<flutype>_HI_titers.txt``.

    A measurement is written (tab-separated: test, ref[0], ref[1], src_id,
    val) only when both the test strain and the reference strain, passed
    through ``HI_fix_name`` and upper-cased, occur in the first column of
    ``data/<flutype>_HI_strains.txt``.  The numbers of written and skipped
    records are printed at the end.
    """
    measurements = get_all_titers_flat(flutype)
    with myopen('data/' + flutype + '_HI_strains.txt') as infile:
        # a set, so the two membership tests per record below are O(1)
        strains = set(HI_fix_name(line.strip().split('\t')[0]).upper()
                      for line in infile)
    if fname is None:
        fname = 'data/' + flutype + '_HI_titers.txt'

    written = 0
    skipped = 0
    with myopen(fname, 'w') as outfile:
        for ii, rec in measurements.iterrows():
            test, ref, src_id, val = rec
            if (HI_fix_name(test).upper() in strains
                    and HI_fix_name(ref[0]).upper() in strains):
                outfile.write('\t'.join(map(str, [test, ref[0], ref[1],
                                                  src_id, val])) + '\n')
                written += 1
            else:
                skipped += 1
    # single-string form prints the same text on Python 2 and Python 3
    print("written %d records" % written)
    print("skipped %d records" % skipped)
def load(self):
    """Restore attributes from the files listed in ``self.file_dumps``.

    Every entry except ``'tree'`` whose file exists is unpickled and
    assigned to the attribute of the same name.  The ``'tree'`` file is not
    unpickled; if it exists its name is passed to ``self.build_tree``.

    Raises:
        KeyError: if ``self.file_dumps`` has no ``'tree'`` entry.
    """
    from cPickle import load
    for attr_name, fname in self.file_dumps.iteritems():
        # the tree is handled after the loop, so never open its file here
        if attr_name == 'tree' or not os.path.isfile(fname):
            continue
        # Pickles must be read in binary mode; text mode corrupts binary
        # protocols on platforms that translate line endings.
        # NOTE: unpickling executes arbitrary code -- only load trusted files.
        with myopen(fname, 'rb') as ifile:
            setattr(self, attr_name, load(ifile))

    fname = self.file_dumps['tree']
    if os.path.isfile(fname):
        self.build_tree(fname)
def dump(self):
    """Write every available attribute named in ``self.file_dumps`` to disk.

    Attributes the instance does not have are skipped.  The ``'tree'``
    entry is written in newick format via ``Bio.Phylo``; everything else is
    pickled with the highest protocol.  Side effect: ``self.seqs.raw_seqs``
    is set to ``None`` before ``'seqs'`` is pickled.
    """
    from cPickle import dump as pickle_dump
    from Bio import Phylo

    for name, path in self.file_dumps.iteritems():
        if not hasattr(self, name):
            continue
        print("dumping", name)
        if name == 'seqs':
            # drop the raw sequences so they are not part of the pickle
            self.seqs.raw_seqs = None
        with myopen(path, 'wb') as handle:
            if name != 'tree':
                pickle_dump(getattr(self, name), handle, -1)
            else:
                Phylo.write(self.tree.tree, handle, 'newick')
def dump(self):
    """Serialize the attributes listed in ``self.file_dumps``.

    For each ``attribute name -> file name`` pair whose attribute exists on
    the instance, the file is opened for binary writing and the attribute is
    stored: ``'tree'`` as a newick tree (``self.tree.tree``), all others as
    a pickle using protocol ``-1``.  ``self.seqs.raw_seqs`` is reset to
    ``None`` before ``'seqs'`` is stored.
    """
    from cPickle import dump as write_pickle
    from Bio import Phylo

    for attribute, target in self.file_dumps.iteritems():
        if hasattr(self, attribute):
            print("dumping", attribute)
            is_tree = (attribute == 'tree')
            if attribute == 'seqs':
                self.seqs.raw_seqs = None
            with myopen(target, 'wb') as out:
                if is_tree:
                    Phylo.write(self.tree.tree, out, 'newick')
                    continue
                write_pickle(getattr(self, attribute), out, -1)
def load(self):
    """Reload pickled attributes and the tree named in ``self.file_dumps``.

    For every entry other than ``'tree'`` whose file exists, the file is
    unpickled and the result stored on the instance under the entry's name.
    The ``'tree'`` file, if present, is passed by name to
    ``self.build_tree`` instead of being unpickled.

    Raises:
        KeyError: if ``self.file_dumps`` has no ``'tree'`` entry.
    """
    from cPickle import load
    for attr_name, fname in self.file_dumps.iteritems():
        if attr_name == 'tree':
            continue  # handled below; no need to open the file here
        if os.path.isfile(fname):
            # Binary mode is required for pickle data; reading it in text
            # mode breaks on platforms that translate line endings.
            # NOTE: unpickling runs arbitrary code -- trusted files only.
            with myopen(fname, 'rb') as ifile:
                setattr(self, attr_name, load(ifile))

    fname = self.file_dumps['tree']
    if os.path.isfile(fname):
        self.build_tree(fname)
def write_flat_HI_titers(flutype='H3N2', fname=None):
    """Dump all flat titer measurements whose strains are in the strain list.

    Args:
        flutype: lineage prefix used for the file names under ``data/``.
        fname: output file; ``data/<flutype>_HI_titers.txt`` when ``None``.

    The strain list is the first tab-separated column of
    ``data/<flutype>_HI_strains.txt``, normalized with ``HI_fix_name`` and
    upper-cased.  A record is written as ``test, ref[0], ref[1], src_id,
    val`` (tab-separated) when both its test strain and ``ref[0]`` are in
    that list, otherwise it is counted as skipped.  Both counts are printed.
    """
    measurements = get_all_titers_flat(flutype)
    with myopen('data/' + flutype + '_HI_strains.txt') as infile:
        # set instead of list: membership is tested twice per record
        strains = set(
            HI_fix_name(line.strip().split('\t')[0]).upper()
            for line in infile
        )
    if fname is None:
        fname = 'data/' + flutype + '_HI_titers.txt'

    written = 0
    skipped = 0
    with myopen(fname, 'w') as outfile:
        for ii, rec in measurements.iterrows():
            test, ref, src_id, val = rec
            test_known = HI_fix_name(test).upper() in strains
            if test_known and HI_fix_name(ref[0]).upper() in strains:
                outfile.write(
                    '\t'.join(map(str, [test, ref[0], ref[1], src_id, val]))
                    + '\n')
                written += 1
            else:
                skipped += 1
    # one formatted string prints identically under Python 2 and Python 3
    print("written %d records" % written)
    print("skipped %d records" % skipped)
def read_HI_titers(self, fname):
    """Read a flat titer file into per-pair measurement lists.

    Each line is split on whitespace; the first five fields are taken as
    test virus, reference virus, serum, source id and titer value (float).
    Lines whose source id is in ``self.excluded_tables`` are ignored.
    Lines that have too few fields or a non-numeric value are printed and
    skipped instead of aborting the whole read.

    Args:
        fname: path of the titer file (opened with ``myopen``).

    Returns:
        ``(measurements, strains, sources)`` where ``measurements`` maps
        ``(test, (ref_virus, serum))`` to the list of values, ``strains`` is
        the set of test names and ``(ref_virus, serum)`` tuples seen, and
        ``sources`` is the set of source ids.
    """
    strains = set()
    measurements = defaultdict(list)
    sources = set()
    with myopen(fname, 'r') as infile:
        for line in infile:
            entries = line.strip().split()
            # Parsing is the step that can actually fail, so that is what
            # the error handling has to guard.
            try:
                test, ref_virus, serum, src_id = entries[:4]
                val = float(entries[4])
            except (IndexError, ValueError):
                print(line.strip())
                continue
            if src_id in self.excluded_tables:
                continue
            ref = (ref_virus, serum)
            measurements[(test, ref)].append(val)
            strains.update([test, ref])
            sources.add(src_id)
    return measurements, strains, sources
def read_trevor_table(flutype):
    """Read the tab-separated titer table ``data/<flutype>_HI.tsv``.

    The first row is treated as a header and discarded.  For every other
    row, column 6 is converted with ``titer_to_number``; rows whose value is
    NaN are dropped.  Column 1 (after ``HI_fix_name``) is the test strain,
    the serum is the tuple ``(HI_fix_name(column 4), column 3)`` and the
    last column is used as source id.

    Returns:
        ``(strains, sera, measurements)``: the set of test strain names, the
        set of serum tuples, and a ``pandas.DataFrame`` with one
        ``[test strain, serum, source id, value]`` row per measurement.
        When the file does not exist a message is printed and the sets and
        the DataFrame are empty.
    """
    import csv
    trevor_table = 'data/' + flutype + '_HI.tsv'
    measurements = []
    sera = set()
    strains = set()
    if os.path.isfile(trevor_table):
        with myopen(trevor_table) as infile:
            table_reader = csv.reader(infile, delimiter="\t")
            # Skip the header; the default keeps an empty file from raising
            # StopIteration, and next() works on Python 2 and 3 alike.
            next(table_reader, None)
            for row in table_reader:
                if not row:
                    continue  # csv yields [] for blank lines
                val = titer_to_number(row[6])
                if np.isnan(val):
                    continue
                test_strain = HI_fix_name(row[1])
                serum = (HI_fix_name(row[4]), row[3])
                strains.add(test_strain)
                sera.add(serum)
                measurements.append([test_strain, serum, row[-1], val])
    else:
        print("%s not found" % trevor_table)
    print("trevor total: %d measurements" % len(measurements))
    return strains, sera, pd.DataFrame(measurements)
def read_trevor_table(flutype):
    """Load measurements from ``data/<flutype>_HI.tsv`` if that file exists.

    The file is read as tab-separated values and its first row is skipped as
    header.  Per data row: the titer is ``titer_to_number(row[6])`` (NaN
    rows are ignored), the test strain is ``HI_fix_name(row[1])``, the serum
    is ``(HI_fix_name(row[4]), row[3])`` and the source id is the last
    column.

    Returns:
        A 3-tuple ``(strains, sera, DataFrame)``; the DataFrame rows are
        ``[test strain, serum, source id, value]``.  If the file is missing,
        a notice is printed and all three are empty.
    """
    import csv
    trevor_table = 'data/' + flutype + '_HI.tsv'
    measurements = []
    sera = set()
    strains = set()
    if not os.path.isfile(trevor_table):
        print("%s not found" % trevor_table)
    else:
        with myopen(trevor_table) as infile:
            table_reader = csv.reader(infile, delimiter="\t")
            # built-in next() with a default: portable across Python 2/3 and
            # harmless when the file has no header row at all
            next(table_reader, None)
            for row in table_reader:
                if not row:
                    # blank line in the file; nothing to parse
                    continue
                val = titer_to_number(row[6])
                if not np.isnan(val):
                    test_strain = HI_fix_name(row[1])
                    serum = (HI_fix_name(row[4]), row[3])
                    src_id = row[-1]
                    strains.add(test_strain)
                    sera.add(serum)
                    measurements.append([test_strain, serum, src_id, val])
    print("trevor total: %d measurements" % len(measurements))
    return strains, sera, pd.DataFrame(measurements)
def parse_HI_matrix(fname):
    """Parse one HI titer matrix in CSV form into a ``pandas.DataFrame``.

    Layout, as consumed by this function:

    * Rows 1-3 describe the sera, one per column from column 5 onwards.
      Rows 1 and 2 are joined with ``/`` to form the reference virus name,
      row 3 (spaces removed) is kept as the serum identifier.  Location
      abbreviations are expanded, trailing digits are stripped from the
      location field and a two-digit trailing year is expanded
      (< 20 -> 20xx, otherwise 19xx).
    * Rows are skipped up to the one whose first cell starts with
      ``REFERENCE``; the rows after it, up to the one starting with
      ``TEST``, are reference viruses.
    * The rows after that are test viruses for as long as the first cell
      starts with ``A/`` or ``B/``.

    Args:
        fname: path of the CSV file; its last ``/``-component is used as
            source id.

    Returns:
        DataFrame indexed by ``HI_fix_name``-normalized strain names with
        the columns ``source``, ``ref/test``, ``genetic group``,
        ``collection date``, ``passage history`` followed by one
        ``(virus name, serum id)`` tuple column per serum.
    """
    import csv

    name_abbrev = {
        'HK': "HONGKONG", 'SWITZ': "SWITZERLAND", 'VIC': "VICTORIA",
        'STOCK': "STOCKHOLM", 'STHAFR': "SOUTHAFRICA",
        'SAFRICA': "SOUTHAFRICA", "ENG": "ENGLAND",
        "NIB-85": "A/ALMATY/2958/2013", 'NOR': 'NORWAY',
        'NTHCAROL': "NORTHCAROLINA", 'ALA': "ALABAMA", 'NY': "NEWYORK",
        "GLAS": "GLASGOW", "AL": "ALABAMA", "NETH": "NETHERLANDS",
        "FIN": "FINLAND", "BRIS": "BRISBANE", "MARY": "MARYLAND",
        "ST.P'BURG": "ST.PETERSBURG", 'CAL': 'CALIFORNIA',
        'AUCK': 'AUCKLAND', "C'CHURCH": 'CHRISTCHURCH',
        'CHCH': 'CHRISTCHURCH', 'ASTR': 'ASTRAKHAN', 'ASTRAK': 'ASTRAKHAN',
        'ST.P': "ST.PETERSBURG", 'ST P': "ST.PETERSBURG",
        'STP': "ST.PETERSBURG", 'JHB': 'JOHANNESBURG', 'FOR': 'FORMOSA',
        'MAL': 'MALAYSIA', 'STHAUS': 'SOUTHAUSTRALIA', 'FL': 'FLORIDA',
        'MASS': 'MASSACHUSETTS', 'NOVO': 'NOVOSIBIRSK', 'WIS': 'WISCONSIN',
        'BANG': 'BANGLADESH', 'EG': 'EGYPT',
    }
    digits = '0123456789'
    src_id = fname.split('/')[-1]
    print(fname)
    with myopen(fname) as infile:
        csv_reader = csv.reader(infile)

        # parse sera
        row1 = next(csv_reader)
        row2 = next(csv_reader)
        row3 = next(csv_reader)
        ref_sera = [[HI_fix_name(e1 + '/' + e2), e3.replace(' ', '')]
                    for e1, e2, e3 in list(zip(row1, row2, row3))[4:]]
        for serum in ref_sera:
            # expand an abbreviated location (second name field)
            abbr = serum[0].split('/')[1].rstrip(digits)
            if abbr in name_abbrev:
                serum[0] = HI_fix_name(
                    serum[0].replace(abbr, name_abbrev[abbr]))
            else:
                serum[0] = HI_fix_name(serum[0])
            # strip numbers
            tmp = serum[0].split('/')
            serum[0] = '/'.join([tmp[0], tmp[1].rstrip(digits)] + tmp[2:])
            # expand a two-digit year in the last field
            parts = serum[0].split('/')
            try:
                y = int(parts[-1])
            except ValueError:
                # last field is not a number: report the serum, keep it as is
                print(serum)
            else:
                if y < 100:
                    century = 2000 if y < 20 else 1900
                    serum[0] = '/'.join(parts[:-1]) + '/' + str(century + y)

        fields = ['source', 'ref/test', 'genetic group', 'collection date',
                  'passage history'] + [tuple(serum) for serum in ref_sera]

        for row in csv_reader:  # advance until the reference virus
            if row and row[0].startswith('REFERENCE'):
                break

        ref_strains = []
        ref_matrix = []
        for row in csv_reader:
            if not row:
                continue  # csv yields [] for blank lines
            if row[0].startswith('TEST'):
                break
            # load matrices until the test virus section starts
            ref_strains.append(HI_fix_name(row[0].strip()))
            ref_matrix.append([src_id, 'ref']
                              + [cell.strip() for cell in row[1:4]]
                              + [titer_to_number(cell) for cell in row[4:]])

        test_strains = []
        test_matrix = []
        for row in csv_reader:
            # load test viruses until it is no longer an A/ or B/ flu name
            if not row or not row[0].startswith(('A/', 'B/')):
                break
            test_strains.append(HI_fix_name(row[0].strip()))
            test_matrix.append([src_id, 'test']
                               + [cell.strip() for cell in row[1:4]]
                               + [titer_to_number(cell) for cell in row[4:]])

    print("%d %s" % (len(ref_sera), ref_sera))
    print("%d %d" % (len(ref_strains), len(test_strains)))
    HI_table = pd.DataFrame(ref_matrix + test_matrix,
                            index=ref_strains + test_strains,
                            columns=fields)
    return HI_table
def parse_HI_matrix(fname):
    """Read a CSV HI titer matrix and return it as a ``pandas.DataFrame``.

    The first three rows define the sera (columns 5 and up): rows 1 and 2
    joined by ``/`` give the reference virus name, row 3 without spaces is
    the serum identifier.  Virus names get abbreviated locations expanded,
    trailing digits removed from the location field, and a two-digit final
    year turned into four digits (< 20 -> 20xx, else 19xx).

    After the row starting with ``REFERENCE`` come the reference viruses,
    after the row starting with ``TEST`` the test viruses; the test section
    ends at the first row whose first cell does not start with ``A/`` or
    ``B/``.

    Args:
        fname: CSV path; the part after the last ``/`` becomes the source id.

    Returns:
        DataFrame indexed by normalized strain name.  Columns are
        ``source``, ``ref/test``, ``genetic group``, ``collection date``,
        ``passage history`` and one ``(virus name, serum id)`` tuple per
        serum holding the ``titer_to_number`` values.
    """
    import csv

    name_abbrev = {
        'HK': "HONGKONG", 'SWITZ': "SWITZERLAND", 'VIC': "VICTORIA",
        'STOCK': "STOCKHOLM", 'STHAFR': "SOUTHAFRICA",
        'SAFRICA': "SOUTHAFRICA", "ENG": "ENGLAND",
        "NIB-85": "A/ALMATY/2958/2013", 'NOR': 'NORWAY',
        'NTHCAROL': "NORTHCAROLINA", 'ALA': "ALABAMA", 'NY': "NEWYORK",
        "GLAS": "GLASGOW", "AL": "ALABAMA", "NETH": "NETHERLANDS",
        "FIN": "FINLAND", "BRIS": "BRISBANE", "MARY": "MARYLAND",
        "ST.P'BURG": "ST.PETERSBURG", 'CAL': 'CALIFORNIA',
        'AUCK': 'AUCKLAND', "C'CHURCH": 'CHRISTCHURCH',
        'CHCH': 'CHRISTCHURCH', 'ASTR': 'ASTRAKHAN', 'ASTRAK': 'ASTRAKHAN',
        'ST.P': "ST.PETERSBURG", 'ST P': "ST.PETERSBURG",
        'STP': "ST.PETERSBURG", 'JHB': 'JOHANNESBURG', 'FOR': 'FORMOSA',
        'MAL': 'MALAYSIA', 'STHAUS': 'SOUTHAUSTRALIA', 'FL': 'FLORIDA',
        'MASS': 'MASSACHUSETTS', 'NOVO': 'NOVOSIBIRSK', 'WIS': 'WISCONSIN',
        'BANG': 'BANGLADESH', 'EG': 'EGYPT',
    }
    digits = '0123456789'
    src_id = fname.split('/')[-1]
    print(fname)
    with myopen(fname) as infile:
        csv_reader = csv.reader(infile)

        # parse sera
        row1 = next(csv_reader)
        row2 = next(csv_reader)
        row3 = next(csv_reader)
        serum_columns = list(zip(row1, row2, row3))[4:]
        ref_sera = [[HI_fix_name(e1 + '/' + e2), e3.replace(' ', '')]
                    for e1, e2, e3 in serum_columns]
        for serum in ref_sera:
            # the second name field may be an abbreviated location
            abbr = serum[0].split('/')[1].rstrip(digits)
            if abbr in name_abbrev:
                serum[0] = HI_fix_name(
                    serum[0].replace(abbr, name_abbrev[abbr]))
            else:
                serum[0] = HI_fix_name(serum[0])
            # strip numbers
            tmp = serum[0].split('/')
            serum[0] = '/'.join([tmp[0], tmp[1].rstrip(digits)] + tmp[2:])
            # two-digit years become four-digit years
            parts = serum[0].split('/')
            try:
                y = int(parts[-1])
            except ValueError:
                # only int() can fail here; show the serum and leave it alone
                print(serum)
            else:
                if y < 100:
                    century = 2000 if y < 20 else 1900
                    serum[0] = '/'.join(parts[:-1]) + '/' + str(century + y)

        fields = ['source', 'ref/test', 'genetic group', 'collection date',
                  'passage history'] + [tuple(serum) for serum in ref_sera]

        for row in csv_reader:  # advance until the reference virus
            if row and row[0].startswith('REFERENCE'):
                break

        ref_strains = []
        ref_matrix = []
        for row in csv_reader:
            if not row:
                continue  # blank line: csv yields an empty list
            if row[0].startswith('TEST'):
                break
            # load matrices until the test virus section starts
            ref_strains.append(HI_fix_name(row[0].strip()))
            ref_matrix.append([src_id, 'ref']
                              + [cell.strip() for cell in row[1:4]]
                              + [titer_to_number(cell) for cell in row[4:]])

        test_strains = []
        test_matrix = []
        for row in csv_reader:
            # load test viruses until it is no longer an A/ or B/ flu name
            if not row or not row[0].startswith(('A/', 'B/')):
                break
            test_strains.append(HI_fix_name(row[0].strip()))
            test_matrix.append([src_id, 'test']
                               + [cell.strip() for cell in row[1:4]]
                               + [titer_to_number(cell) for cell in row[4:]])

    print("%d %s" % (len(ref_sera), ref_sera))
    print("%d %d" % (len(ref_strains), len(test_strains)))
    HI_table = pd.DataFrame(ref_matrix + test_matrix,
                            index=ref_strains + test_strains,
                            columns=fields)
    return HI_table