Esempio n. 1
0
def write_strains_with_HI_and_sequence(flutype='H3N2'):
    """Write sequences and names of the strains that have HI measurements.

    Two files are written below ``data/``:

    * ``<flutype>_strains_with_HI.fasta`` -- for every strain name that
      occurs in the index of ``read_tables(flutype)`` or in the first
      element returned by ``read_trevor_table(flutype)``, the first record
      of ``<flutype>_gisaid_epiflu_sequence.fasta`` whose name (text before
      the first ``|``, passed through ``HI_fix_name``) matches.
    * ``<flutype>_HI_strains.txt`` -- ``name<TAB>count`` lines, ordered by
      decreasing count, where count is the number of rows of
      ``get_all_titers_flat(flutype)`` that have the strain in their first
      field.  If ``fix_name`` changes a name, an extra line with the
      changed name and the same count follows.
    """
    titer_tables = read_tables(flutype)
    trevor_strains = read_trevor_table(flutype)[0]
    wanted = set(titer_tables.index)
    wanted.update(trevor_strains)
    from Bio import SeqIO

    prefix = "data/" + flutype
    already_written = set()
    with myopen(prefix + "_strains_with_HI.fasta", 'w') as fasta_out, \
         myopen(prefix + "_gisaid_epiflu_sequence.fasta", 'r') as fasta_in:
        for record in SeqIO.parse(fasta_in, 'fasta'):
            name = HI_fix_name(record.description.split('|')[0].strip())
            # keep only the first sequence of each strain with titers
            if name not in wanted or name in already_written:
                continue
            SeqIO.write(record, fasta_out, 'fasta')
            already_written.add(name)

    # count how often each strain appears as the test virus
    n_titers = defaultdict(int)
    for _, row in get_all_titers_flat(flutype).iterrows():
        test_strain, _ref, _src_id, _val = row
        n_titers[test_strain] += 1

    with myopen(prefix + "_HI_strains.txt", 'w') as strain_out:
        ranked = sorted(n_titers.items(),
                        key=lambda item: item[1],
                        reverse=True)
        for strain, count in ranked:
            tail = '\t' + str(count) + '\n'
            strain_out.write(strain + tail)
            alias = fix_name(strain)
            if alias != strain:
                strain_out.write(alias + tail)
Esempio n. 2
0
def write_strains_with_HI_and_sequence(flutype='H3N2'):
    """Write sequences and names of the strains that have HI measurements.

    Two files are written below ``data/``:

    * ``<flutype>_strains_with_HI.fasta`` -- for every strain name that
      occurs in the index of ``read_tables(flutype)`` or in the first
      element returned by ``read_trevor_table(flutype)``, the first record
      of ``<flutype>_gisaid_epiflu_sequence.fasta`` whose name (text before
      the first ``|``, passed through ``HI_fix_name``) matches.
    * ``<flutype>_HI_strains.txt`` -- ``name<TAB>count`` lines, ordered by
      decreasing count, where count is the number of rows of
      ``get_all_titers_flat(flutype)`` that have the strain in their first
      field.  If ``fix_name`` changes a name, an extra line with the
      changed name and the same count follows.
    """
    titer_tables = read_tables(flutype)
    trevor_strains = read_trevor_table(flutype)[0]
    wanted = set(titer_tables.index)
    wanted.update(trevor_strains)
    from Bio import SeqIO

    prefix = "data/" + flutype
    already_written = set()
    with myopen(prefix + "_strains_with_HI.fasta", 'w') as fasta_out, \
         myopen(prefix + "_gisaid_epiflu_sequence.fasta", 'r') as fasta_in:
        for record in SeqIO.parse(fasta_in, 'fasta'):
            name = HI_fix_name(record.description.split('|')[0].strip())
            # keep only the first sequence of each strain with titers
            if name not in wanted or name in already_written:
                continue
            SeqIO.write(record, fasta_out, 'fasta')
            already_written.add(name)

    # count how often each strain appears as the test virus
    n_titers = defaultdict(int)
    for _, row in get_all_titers_flat(flutype).iterrows():
        test_strain, _ref, _src_id, _val = row
        n_titers[test_strain] += 1

    with myopen(prefix + "_HI_strains.txt", 'w') as strain_out:
        ranked = sorted(n_titers.items(),
                        key=lambda item: item[1],
                        reverse=True)
        for strain, count in ranked:
            tail = '\t' + str(count) + '\n'
            strain_out.write(strain + tail)
            alias = fix_name(strain)
            if alias != strain:
                strain_out.write(alias + tail)
Esempio n. 3
0
    def check_sources(self):
        """Validate the measurements of every source table separately.

        Reads ``self.HI_fname``, whose lines hold five whitespace-separated
        fields (test virus, reference virus, serum, source id, titer value).
        Every non-NaN value is passed through ``self.normalize`` and stored
        in ``self.source_HIs[src_id][test, (ref_virus, serum)]``.  Records
        whose value cannot be converted or normalized are printed and
        skipped; blank lines are ignored.

        Afterwards ``self.validate`` is run once per source and
        ``self.source_validation[src_id]`` is set to ``[abs_error,
        rms_error, slope, intercept, n_checks]`` (the first four are read
        from ``self`` after the call -- presumably set by ``validate``).
        Sources for which validation raises are reported as skipped.
        """
        self.source_HIs = defaultdict(dict)
        with myopen(self.HI_fname, 'r') as infile:
            for line in infile:
                fields = line.strip().split()
                if not fields:
                    continue  # tolerate blank / trailing empty lines
                test, ref_virus, serum, src_id, val_str = fields
                try:
                    val = float(val_str)
                    if not np.isnan(val):
                        ref = (ref_virus, serum)
                        self.source_HIs[src_id][test, ref] = self.normalize(
                            ref, val)
                except Exception:
                    # Report the raw text: `val` may not exist (or may be
                    # left over from a previous line) when float() failed.
                    print(test, ref_virus, serum, src_id, val_str)

        self.source_validation = {}
        for src_id in self.source_HIs:
            print('\n############### \n', src_id, '\n############### \n')
            print("number of measurements:", len(self.source_HIs[src_id]))
            try:
                n_checks = self.validate(
                    validation_set=self.source_HIs[src_id],
                    incl_ref_strains='no')
                self.source_validation[src_id] = [
                    self.abs_error, self.rms_error, self.slope, self.intercept,
                    n_checks
                ]
            except Exception:
                # best effort: keep going with the remaining sources, but
                # let KeyboardInterrupt / SystemExit propagate
                print("skipped due to too few measurements")
Esempio n. 4
0
def write_flat_HI_titers(flutype='H3N2', fname=None):
    """Write all titers between known strains as a flat tab-separated file.

    The set of known strains is the first tab-separated field of every line
    of ``data/<flutype>_HI_strains.txt``, passed through ``HI_fix_name``
    and upper-cased.  A measurement from ``get_all_titers_flat(flutype)``
    is written only if both its test virus and the virus of its reference
    serum (``ref[0]``) are in that set (same normalization).

    Parameters
    ----------
    flutype : str
        Prefix of the input/output file names below ``data/``.
    fname : str or None
        Output file; defaults to ``data/<flutype>_HI_titers.txt``.

    Each output line is ``test, ref[0], ref[1], src_id, val``.  The numbers
    of written and skipped records are printed at the end.
    """
    measurements = get_all_titers_flat(flutype)
    with myopen('data/' + flutype + '_HI_strains.txt') as infile:
        # a set gives constant-time membership tests in the loop below
        strains = set(HI_fix_name(line.strip().split('\t')[0]).upper()
                      for line in infile)
    if fname is None:
        fname = 'data/' + flutype + '_HI_titers.txt'
    written = 0
    skipped = 0
    with myopen(fname, 'w') as outfile:
        for _, rec in measurements.iterrows():
            test, ref, src_id, val = rec
            if (HI_fix_name(test).upper() in strains
                    and HI_fix_name(ref[0]).upper() in strains):
                outfile.write(
                    '\t'.join(map(str, [test, ref[0], ref[1], src_id, val]))
                    + '\n')
                written += 1
            else:
                skipped += 1
    # single-argument form prints the same text under Python 2 and 3
    print("written %d records" % written)
    print("skipped %d records" % skipped)
Esempio n. 5
0
 def load(self):
     """Restore the attributes listed in ``self.file_dumps`` from disk.

     ``self.file_dumps`` maps attribute names to file names.  Every entry
     except ``'tree'`` whose file exists is unpickled and assigned to the
     attribute of the same name.  The ``'tree'`` file, if it exists, is
     handed to ``self.build_tree`` instead.

     Raises KeyError if ``self.file_dumps`` has no ``'tree'`` entry.
     """
     try:
         from cPickle import load as unpickle  # Python 2
     except ImportError:
         from pickle import load as unpickle  # Python 3
     for attr_name, fname in self.file_dumps.items():
         # the tree is not pickled; it is rebuilt from its file below
         if attr_name == 'tree' or not os.path.isfile(fname):
             continue
         # NOTE(review): unpickling can execute arbitrary code -- only
         # load dump files that come from a trusted source.
         # Pickles are binary data (dump() writes them with 'wb').
         with myopen(fname, 'rb') as ifile:
             setattr(self, attr_name, unpickle(ifile))
     fname = self.file_dumps['tree']
     if os.path.isfile(fname):
         self.build_tree(fname)
Esempio n. 6
0
 def dump(self):
     """Write every attribute named in ``self.file_dumps`` to its file.

     ``self.file_dumps`` maps attribute names to file names; names that
     are not attributes of ``self`` are skipped.  ``'tree'`` is written as
     newick via ``Bio.Phylo`` (from ``self.tree.tree``); everything else is
     pickled with protocol ``-1``.  Note that ``self.seqs.raw_seqs`` is
     reset to None before ``'seqs'`` is dumped.
     """
     from cPickle import dump as pickle_dump
     from Bio import Phylo
     for attr_name, fname in self.file_dumps.iteritems():
         if not hasattr(self, attr_name):
             continue
         print("dumping", attr_name)
         if attr_name == 'seqs':
             # drop the raw sequences so they are not part of the dump
             self.seqs.raw_seqs = None
         with myopen(fname, 'wb') as ofile:
             if attr_name != 'tree':
                 pickle_dump(getattr(self, attr_name), ofile, -1)
             else:
                 Phylo.write(self.tree.tree, ofile, 'newick')
Esempio n. 7
0
 def dump(self):
     """Write every attribute named in ``self.file_dumps`` to its file.

     ``self.file_dumps`` maps attribute names to file names; names that
     are not attributes of ``self`` are skipped.  ``'tree'`` is written as
     newick via ``Bio.Phylo`` (from ``self.tree.tree``); everything else is
     pickled with protocol ``-1``.  Note that ``self.seqs.raw_seqs`` is
     reset to None before ``'seqs'`` is dumped.
     """
     from cPickle import dump as pickle_dump
     from Bio import Phylo
     for attr_name, fname in self.file_dumps.iteritems():
         if not hasattr(self, attr_name):
             continue
         print("dumping", attr_name)
         if attr_name == 'seqs':
             # drop the raw sequences so they are not part of the dump
             self.seqs.raw_seqs = None
         with myopen(fname, 'wb') as ofile:
             if attr_name != 'tree':
                 pickle_dump(getattr(self, attr_name), ofile, -1)
             else:
                 Phylo.write(self.tree.tree, ofile, 'newick')
Esempio n. 8
0
 def load(self):
     """Restore the attributes listed in ``self.file_dumps`` from disk.

     ``self.file_dumps`` maps attribute names to file names.  Every entry
     except ``'tree'`` whose file exists is unpickled and assigned to the
     attribute of the same name.  The ``'tree'`` file, if it exists, is
     handed to ``self.build_tree`` instead.

     Raises KeyError if ``self.file_dumps`` has no ``'tree'`` entry.
     """
     try:
         from cPickle import load as unpickle  # Python 2
     except ImportError:
         from pickle import load as unpickle  # Python 3
     for attr_name, fname in self.file_dumps.items():
         # the tree is not pickled; it is rebuilt from its file below
         if attr_name == 'tree' or not os.path.isfile(fname):
             continue
         # NOTE(review): unpickling can execute arbitrary code -- only
         # load dump files that come from a trusted source.
         # Pickles are binary data (dump() writes them with 'wb').
         with myopen(fname, 'rb') as ifile:
             setattr(self, attr_name, unpickle(ifile))
     fname = self.file_dumps['tree']
     if os.path.isfile(fname):
         self.build_tree(fname)
Esempio n. 9
0
def write_flat_HI_titers(flutype='H3N2', fname=None):
    """Write all titers between known strains as a flat tab-separated file.

    The set of known strains is the first tab-separated field of every line
    of ``data/<flutype>_HI_strains.txt``, passed through ``HI_fix_name``
    and upper-cased.  A measurement from ``get_all_titers_flat(flutype)``
    is written only if both its test virus and the virus of its reference
    serum (``ref[0]``) are in that set (same normalization).

    Parameters
    ----------
    flutype : str
        Prefix of the input/output file names below ``data/``.
    fname : str or None
        Output file; defaults to ``data/<flutype>_HI_titers.txt``.

    Each output line is ``test, ref[0], ref[1], src_id, val``.  The numbers
    of written and skipped records are printed at the end.
    """
    measurements = get_all_titers_flat(flutype)
    with myopen('data/' + flutype + '_HI_strains.txt') as infile:
        # a set gives constant-time membership tests in the loop below
        strains = set(HI_fix_name(line.strip().split('\t')[0]).upper()
                      for line in infile)
    if fname is None:
        fname = 'data/' + flutype + '_HI_titers.txt'
    written = 0
    skipped = 0
    with myopen(fname, 'w') as outfile:
        for _, rec in measurements.iterrows():
            test, ref, src_id, val = rec
            if (HI_fix_name(test).upper() in strains
                    and HI_fix_name(ref[0]).upper() in strains):
                outfile.write(
                    '\t'.join(map(str, [test, ref[0], ref[1], src_id, val]))
                    + '\n')
                written += 1
            else:
                skipped += 1
    # single-argument form prints the same text under Python 2 and 3
    print("written %d records" % written)
    print("skipped %d records" % skipped)
Esempio n. 10
0
 def read_HI_titers(self, fname):
     """Read a flat titer file and group the values by (test, serum) pair.

     Every line of *fname* is split on whitespace into test virus,
     reference virus, serum, source id and titer value (extra fields are
     ignored).  Lines whose source id is in ``self.excluded_tables`` are
     dropped.  Blank lines are ignored; lines with fewer than five fields
     or a non-numeric value are printed and skipped.

     Returns
     -------
     measurements : defaultdict(list)
         ``(test, (ref_virus, serum))`` -> list of float values.
     strains : set
         Contains every test name and every ``(ref_virus, serum)`` tuple
         that has at least one measurement.
     sources : set
         Source ids of the measurements that were kept.
     """
     strains = set()
     measurements = defaultdict(list)
     sources = set()
     with myopen(fname, 'r') as infile:
         for line in infile:
             entries = line.strip().split()
             if not entries:
                 continue  # e.g. trailing empty line
             try:
                 test, ref_virus, serum, src_id = entries[:4]
                 val = float(entries[4])
             except (IndexError, ValueError):
                 # malformed record: report it and carry on
                 print(line.strip())
                 continue
             if src_id in self.excluded_tables:
                 continue
             ref = (ref_virus, serum)
             measurements[(test, ref)].append(val)
             strains.update([test, ref])
             sources.add(src_id)
     return measurements, strains, sources
Esempio n. 11
0
def read_trevor_table(flutype):
    """Read the tab-separated titer table ``data/<flutype>_HI.tsv``.

    The first row is treated as a header and skipped.  For every other row
    the titer is ``titer_to_number(row[6])``; rows with a NaN titer are
    ignored.  Column 1 is used as the test strain, the serum is the tuple
    ``(HI_fix_name(row[4]), row[3])`` and the last column is the source id
    (NOTE(review): column meanings are inferred from how they are used).

    If the file does not exist a message is printed and empty results are
    returned; an empty file also yields empty results.

    Returns
    -------
    strains : set
        Normalized test strain names.
    sera : set
        Serum tuples.
    measurements : pandas.DataFrame
        One row ``[strain, serum, src_id, titer]`` per measurement.
    """
    import csv
    trevor_table = 'data/' + flutype + '_HI.tsv'
    measurements = []
    sera = set()
    strains = set()
    if os.path.isfile(trevor_table):
        with myopen(trevor_table) as infile:
            table_reader = csv.reader(infile, delimiter="\t")
            # skip the header; the default avoids StopIteration on an
            # empty file
            next(table_reader, None)
            for row in table_reader:
                val = titer_to_number(row[6])
                if np.isnan(val):
                    continue
                test = HI_fix_name(row[1])
                serum = (HI_fix_name(row[4]), row[3])
                strains.add(test)
                sera.add(serum)
                measurements.append([test, serum, row[-1], val])
    else:
        print("%s not found" % trevor_table)
    print("trevor total: %d measurements" % len(measurements))
    return strains, sera, pd.DataFrame(measurements)
Esempio n. 12
0
def read_trevor_table(flutype):
    """Read the tab-separated titer table ``data/<flutype>_HI.tsv``.

    The first row is treated as a header and skipped.  For every other row
    the titer is ``titer_to_number(row[6])``; rows with a NaN titer are
    ignored.  Column 1 is used as the test strain, the serum is the tuple
    ``(HI_fix_name(row[4]), row[3])`` and the last column is the source id
    (NOTE(review): column meanings are inferred from how they are used).

    If the file does not exist a message is printed and empty results are
    returned; an empty file also yields empty results.

    Returns
    -------
    strains : set
        Normalized test strain names.
    sera : set
        Serum tuples.
    measurements : pandas.DataFrame
        One row ``[strain, serum, src_id, titer]`` per measurement.
    """
    import csv
    trevor_table = 'data/' + flutype + '_HI.tsv'
    measurements = []
    sera = set()
    strains = set()
    if os.path.isfile(trevor_table):
        with myopen(trevor_table) as infile:
            table_reader = csv.reader(infile, delimiter="\t")
            # skip the header; the default avoids StopIteration on an
            # empty file
            next(table_reader, None)
            for row in table_reader:
                val = titer_to_number(row[6])
                if np.isnan(val):
                    continue
                test = HI_fix_name(row[1])
                serum = (HI_fix_name(row[4]), row[3])
                strains.add(test)
                sera.add(serum)
                measurements.append([test, serum, row[-1], val])
    else:
        print("%s not found" % trevor_table)
    print("trevor total: %d measurements" % len(measurements))
    return strains, sera, pd.DataFrame(measurements)
Esempio n. 13
0
def parse_HI_matrix(fname):
    """Parse one HI titer matrix (CSV file) into a pandas DataFrame.

    Layout the parser relies on (NOTE(review): inferred from the parsing
    code, confirm against a real table):

    * rows 1-3: from the fifth column on, every column describes one
      reference serum; rows 1 and 2 are joined with ``/`` to form the
      strain name and row 3 (spaces removed) identifies the serum;
    * a row whose first cell starts with ``REFERENCE`` precedes the
      reference viruses;
    * a row whose first cell starts with ``TEST`` precedes the test
      viruses, which are read until the first row that does not start
      with ``A/`` or ``B/``.

    Serum strain names are normalized: a known abbreviation of the second
    name field is expanded, trailing digits are stripped from that field,
    and a numeric last field below 100 is expanded to a four-digit year
    (below 20 -> 20xx, otherwise 19xx).  Sera whose last field is not an
    integer are printed and left without year expansion.

    Returns a DataFrame indexed by ``HI_fix_name``-normalized virus names
    with the columns ``'source'`` (file name part of *fname*),
    ``'ref/test'``, ``'genetic group'``, ``'collection date'``,
    ``'passage history'`` and one ``(strain, serum id)`` tuple column per
    reference serum holding ``titer_to_number`` of the cell.
    """
    import csv
    name_abbrev = {
        'HK': "HONGKONG",
        'SWITZ': "SWITZERLAND",
        'VIC': "VICTORIA",
        'STOCK': "STOCKHOLM",
        'STHAFR': "SOUTHAFRICA",
        'SAFRICA': "SOUTHAFRICA",
        "ENG": "ENGLAND",
        "NIB-85": "A/ALMATY/2958/2013",
        'NOR': 'NORWAY',
        'NTHCAROL': "NORTHCAROLINA",
        'ALA': "ALABAMA",
        'NY': "NEWYORK",
        "GLAS": "GLASGOW",
        "AL": "ALABAMA",
        "NETH": "NETHERLANDS",
        "FIN": "FINLAND",
        "BRIS": "BRISBANE",
        "MARY": "MARYLAND",
        "ST.P'BURG": "ST.PETERSBURG",
        'CAL': 'CALIFORNIA',
        'AUCK': 'AUCKLAND',
        "C'CHURCH": 'CHRISTCHURCH',
        'CHCH': 'CHRISTCHURCH',
        'ASTR': 'ASTRAKHAN',
        'ASTRAK': 'ASTRAKHAN',
        'ST.P': "ST.PETERSBURG",
        'ST P': "ST.PETERSBURG",
        'STP': "ST.PETERSBURG",
        'JHB': 'JOHANNESBURG',
        'FOR': 'FORMOSA',
        'MAL': 'MALAYSIA',
        'STHAUS': 'SOUTHAUSTRALIA',
        'FL': 'FLORIDA',
        'MASS': 'MASSACHUSETTS',
        'NOVO': 'NOVOSIBIRSK',
        'WIS': 'WISCONSIN',
        'BANG': 'BANGLADESH',
        'EG': 'EGYPT',
    }
    digits = '0123456789'
    src_id = fname.split('/')[-1]

    def matrix_row(kind, row):
        # metadata cells are whitespace-stripped, titer cells converted
        return ([src_id, kind] + [cell.strip() for cell in row[1:4]] +
                [titer_to_number(cell) for cell in row[4:]])

    print(fname)
    with myopen(fname) as infile:
        csv_reader = csv.reader(infile)

        # parse sera
        row1 = next(csv_reader)
        row2 = next(csv_reader)
        row3 = next(csv_reader)
        ref_sera = [[HI_fix_name(e1 + '/' + e2), e3.replace(' ', '')]
                    for e1, e2, e3 in list(zip(row1, row2, row3))[4:]]
        for serum in ref_sera:
            abbr = serum[0].split('/')[1].rstrip(digits)
            if abbr in name_abbrev:
                serum[0] = HI_fix_name(
                    serum[0].replace(abbr, name_abbrev[abbr]))
            else:
                serum[0] = HI_fix_name(serum[0])
            # strip numbers
            parts = serum[0].split('/')
            serum[0] = '/'.join([parts[0], parts[1].rstrip(digits)] +
                                parts[2:])
            try:
                y = int(serum[0].split('/')[-1])
            except ValueError:
                # last field is not a year; report and leave unchanged
                print(serum)
                continue
            if y < 100:
                # two-digit year: below 20 -> 20xx, otherwise 19xx
                century = 2000 if y < 20 else 1900
                serum[0] = ('/'.join(serum[0].split('/')[:-1]) + '/' +
                            str(century + y))

        fields = [
            'source', 'ref/test', 'genetic group', 'collection date',
            'passage history'
        ] + [tuple(serum) for serum in ref_sera]
        for row in csv_reader:  # advance until the reference virus
            if row[0].startswith('REFERENCE'):
                break

        ref_strains = []
        ref_matrix = []
        for row in csv_reader:
            # load matrices until the test virus section starts
            if row[0].startswith('TEST'):
                break
            ref_strains.append(HI_fix_name(row[0].strip()))
            ref_matrix.append(matrix_row('ref', row))

        test_strains = []
        test_matrix = []
        for row in csv_reader:
            # load test viruses until it is no longer an A/ or B/ flu name
            if not row[0].startswith(('A/', 'B/')):
                break
            test_strains.append(HI_fix_name(row[0].strip()))
            test_matrix.append(matrix_row('test', row))

        print("%d %s" % (len(ref_sera), ref_sera))
        print("%d %d" % (len(ref_strains), len(test_strains)))
        HI_table = pd.DataFrame(ref_matrix + test_matrix,
                                index=ref_strains + test_strains,
                                columns=fields)

        return HI_table
Esempio n. 14
0
def parse_HI_matrix(fname):
    """Parse one HI titer matrix (CSV file) into a pandas DataFrame.

    Layout the parser relies on (NOTE(review): inferred from the parsing
    code, confirm against a real table):

    * rows 1-3: from the fifth column on, every column describes one
      reference serum; rows 1 and 2 are joined with ``/`` to form the
      strain name and row 3 (spaces removed) identifies the serum;
    * a row whose first cell starts with ``REFERENCE`` precedes the
      reference viruses;
    * a row whose first cell starts with ``TEST`` precedes the test
      viruses, which are read until the first row that does not start
      with ``A/`` or ``B/``.

    Serum strain names are normalized: a known abbreviation of the second
    name field is expanded, trailing digits are stripped from that field,
    and a numeric last field below 100 is expanded to a four-digit year
    (below 20 -> 20xx, otherwise 19xx).  Sera whose last field is not an
    integer are printed and left without year expansion.

    Returns a DataFrame indexed by ``HI_fix_name``-normalized virus names
    with the columns ``'source'`` (file name part of *fname*),
    ``'ref/test'``, ``'genetic group'``, ``'collection date'``,
    ``'passage history'`` and one ``(strain, serum id)`` tuple column per
    reference serum holding ``titer_to_number`` of the cell.
    """
    import csv
    name_abbrev = {
        'HK': "HONGKONG",
        'SWITZ': "SWITZERLAND",
        'VIC': "VICTORIA",
        'STOCK': "STOCKHOLM",
        'STHAFR': "SOUTHAFRICA",
        'SAFRICA': "SOUTHAFRICA",
        "ENG": "ENGLAND",
        "NIB-85": "A/ALMATY/2958/2013",
        'NOR': 'NORWAY',
        'NTHCAROL': "NORTHCAROLINA",
        'ALA': "ALABAMA",
        'NY': "NEWYORK",
        "GLAS": "GLASGOW",
        "AL": "ALABAMA",
        "NETH": "NETHERLANDS",
        "FIN": "FINLAND",
        "BRIS": "BRISBANE",
        "MARY": "MARYLAND",
        "ST.P'BURG": "ST.PETERSBURG",
        'CAL': 'CALIFORNIA',
        'AUCK': 'AUCKLAND',
        "C'CHURCH": 'CHRISTCHURCH',
        'CHCH': 'CHRISTCHURCH',
        'ASTR': 'ASTRAKHAN',
        'ASTRAK': 'ASTRAKHAN',
        'ST.P': "ST.PETERSBURG",
        'ST P': "ST.PETERSBURG",
        'STP': "ST.PETERSBURG",
        'JHB': 'JOHANNESBURG',
        'FOR': 'FORMOSA',
        'MAL': 'MALAYSIA',
        'STHAUS': 'SOUTHAUSTRALIA',
        'FL': 'FLORIDA',
        'MASS': 'MASSACHUSETTS',
        'NOVO': 'NOVOSIBIRSK',
        'WIS': 'WISCONSIN',
        'BANG': 'BANGLADESH',
        'EG': 'EGYPT',
    }
    digits = '0123456789'
    src_id = fname.split('/')[-1]

    def matrix_row(kind, row):
        # metadata cells are whitespace-stripped, titer cells converted
        return ([src_id, kind] + [cell.strip() for cell in row[1:4]] +
                [titer_to_number(cell) for cell in row[4:]])

    print(fname)
    with myopen(fname) as infile:
        csv_reader = csv.reader(infile)

        # parse sera
        row1 = next(csv_reader)
        row2 = next(csv_reader)
        row3 = next(csv_reader)
        ref_sera = [[HI_fix_name(e1 + '/' + e2), e3.replace(' ', '')]
                    for e1, e2, e3 in list(zip(row1, row2, row3))[4:]]
        for serum in ref_sera:
            abbr = serum[0].split('/')[1].rstrip(digits)
            if abbr in name_abbrev:
                serum[0] = HI_fix_name(
                    serum[0].replace(abbr, name_abbrev[abbr]))
            else:
                serum[0] = HI_fix_name(serum[0])
            # strip numbers
            parts = serum[0].split('/')
            serum[0] = '/'.join([parts[0], parts[1].rstrip(digits)] +
                                parts[2:])
            try:
                y = int(serum[0].split('/')[-1])
            except ValueError:
                # last field is not a year; report and leave unchanged
                print(serum)
                continue
            if y < 100:
                # two-digit year: below 20 -> 20xx, otherwise 19xx
                century = 2000 if y < 20 else 1900
                serum[0] = ('/'.join(serum[0].split('/')[:-1]) + '/' +
                            str(century + y))

        fields = [
            'source', 'ref/test', 'genetic group', 'collection date',
            'passage history'
        ] + [tuple(serum) for serum in ref_sera]
        for row in csv_reader:  # advance until the reference virus
            if row[0].startswith('REFERENCE'):
                break

        ref_strains = []
        ref_matrix = []
        for row in csv_reader:
            # load matrices until the test virus section starts
            if row[0].startswith('TEST'):
                break
            ref_strains.append(HI_fix_name(row[0].strip()))
            ref_matrix.append(matrix_row('ref', row))

        test_strains = []
        test_matrix = []
        for row in csv_reader:
            # load test viruses until it is no longer an A/ or B/ flu name
            if not row[0].startswith(('A/', 'B/')):
                break
            test_strains.append(HI_fix_name(row[0].strip()))
            test_matrix.append(matrix_row('test', row))

        print("%d %s" % (len(ref_sera), ref_sera))
        print("%d %d" % (len(ref_strains), len(test_strains)))
        HI_table = pd.DataFrame(ref_matrix + test_matrix,
                                index=ref_strains + test_strains,
                                columns=fields)

        return HI_table