Esempio n. 1
0
        Verify that there were not ambiguous indices created for different eLife uploads.
        '''
        indices = []
        unique = 0
        nonunique = 0
        uniq = True
        for meas in measurements:
            index_string = ''
            for field in meas['index']:
                index_string = index_string + str(field)
            if index_string in indices:
                print "Nonunique index field: ", index_string
                nonunique += 1
                uniq = False
            else:
                indices.append(index_string)
                unique += 1
        print "Unique fields: ", unique
        print "Nonunique fields: ", nonunique
        return uniq


if __name__ == "__main__":
    # Script entry point: read the command-line options, make sure the
    # working directory exists, then build the uploader and run it.
    args = parser.parse_args()

    # Use the default directory when no path was given on the command line.
    if args.path is None:
        args.path = "data/"
    # Create the directory (and any missing parents) before uploading.
    if not os.path.isdir(args.path):
        os.makedirs(args.path)

    # The same option dictionary is handed to both the constructor and upload().
    options = vars(args)
    connTDB = elife_upload(**options)
    connTDB.upload(**options)
Esempio n. 2
0
                SeqIO.write([oseq, record], "temp_in.fasta", "fasta")
                os.system("mafft --auto temp_in.fasta > temp_out.fasta 2>tmp")
                tmp_aln = np.array(AlignIO.read('temp_out.fasta', 'fasta'))
                scores.append((olineage, (tmp_aln[0]==tmp_aln[1]).sum()))
            scores.sort(key = lambda x:x[1], reverse=True)
            if scores[0][1]>min_score_percentage*len(record.seq):
                print("Lineage based on similarity:", scores[0][0], doc['strain'], len(record.seq), scores)
                return self.outgroup_patterns[scores[0][0]]
            else:
                print("Couldn't parse virus subtype and lineage from aligning sequence: ", doc['strain'], len(record.seq), scores)
                return None
        except:
            print("Alignment failed: " + doc['strain'])
            return None

if __name__=="__main__":
    # Script entry point: parse the command line, attach the field
    # definitions to the options, ensure the data directory exists, and upload.
    args = parser.parse_args()

    # Integer key -> field name; presumably the position of each field in a
    # fasta header (TODO confirm against the parser that consumes it).
    sequence_fasta_fields = {
        0: 'accession',
        1: 'strain',
        2: 'isolate_id',
        3: 'locus',
        4: 'passage',
        5: 'submitting_lab',
    }
    # Sample header kept from the original author (verify that its field
    # order matches the mapping above):
    #              >>B/Austria/896531/2016  | EPI_ISL_206054 | 687738 | HA | Siat 1
    args.fasta_fields = sequence_fasta_fields

    # Pairs of names; presumably (stored field name, spreadsheet column
    # header) - TODO confirm against the xls parsing code.
    xls_fields_wanted = [
        ('strain', 'Isolate_Name'),
        ('isolate_id', 'Isolate_Id'),
        ('collection_date', 'Collection_Date'),
        ('host', 'Host'),
        ('Subtype', 'Subtype'),
        ('Lineage', 'Lineage'),
        ('gisaid_location', 'Location'),
        ('originating_lab', 'Originating_Lab'),
        ('Host_Age', 'Host_Age'),
        ('Host_Age_Unit', 'Host_Age_Unit'),
        ('gender', 'Host_Gender'),
        ('submission_date', 'Submission_Date'),
    ]
    args.xls_fields_wanted = xls_fields_wanted

    # Use the default directory when no path was given, and create it if missing.
    if args.path is None:
        args.path = "data/"
    if not os.path.isdir(args.path):
        os.makedirs(args.path)

    # The same option dictionary is handed to both the constructor and upload().
    options = vars(args)
    connVDB = flu_upload(**options)
    connVDB.upload(**options)
Esempio n. 3
0
	def separate_viruses_sequences(self, data, **kwargs):
		'''
		Split every record into a virus document and a sequence document.

		The keys kept for each document come from kwargs['virus_attribs'] and
		kwargs['sequence_attribs'] when those are supplied. If either is absent
		(or None), the module-level list of the same name is used instead; those
		lists only exist when this file is run as a script, so callers that
		import the module should pass both lists explicitly.

		data: iterable of dicts, one per record
		kwargs: forwarded unchanged to add_virus_fields and add_sequence_fields
		Returns a tuple (viruses, sequences) of two lists in input order.
		'''
		virus_keys = kwargs.get('virus_attribs')
		if virus_keys is None:
			virus_keys = virus_attribs # module-level fallback, defined in __main__ below
		sequence_keys = kwargs.get('sequence_attribs')
		if sequence_keys is None:
			sequence_keys = sequence_attribs # module-level fallback, defined in __main__ below

		viruses = []
		sequences = []
		for record in data:
			virus_doc = {key: value for key, value in record.items() if key in virus_keys}
			sequence_doc = {key: value for key, value in record.items() if key in sequence_keys}
			# Original author's note: the add_* helpers add attributes specified at
			# the command line, and universal fields like 'number of sequences'.
			viruses.append(self.add_virus_fields(virus_doc, **kwargs))
			sequences.append(self.add_sequence_fields(sequence_doc, **kwargs))
		return (viruses, sequences)


if __name__=="__main__":
	# Script entry point: parse the command line, fill in defaults for
	# options that were not given, then build the uploader and run it.
	args = parser.parse_args() # parser is an argparse object initiated in parse.py

	# These two lists must stay module-level names: separate_viruses_sequences()
	# reads them. Original author's note: they define the fields in fasta headers
	# that you want used in parse.py > parse > parse_fasta_file ---> (viruses, sequences)
	virus_attribs = ['strain', 'original_strain', 'virus', 'serotype','collection_date', 'region', 'country', 'division', 'location']
	sequence_attribs = ['accession', 'strain', 'original_strain', 'virus', 'serotype',  'locus', 'sequence', 'authors', 'PMID', 'source', 'gene_list']

	# Defaults for options left unset; compare with `is None` (identity), not `==`.
	if args.fname is None:
		# The file type is only forced to 'tsv' together with the default file name.
		args.fname = 'results.tbl'
		args.ftype = 'tsv'
	if args.virus is None:
		args.virus = 'dengue'
	if args.database is None:
		args.database = 'vdb'

	# Expose the attribute lists through the options as well.
	args.virus_attribs = virus_attribs
	args.sequence_attribs = sequence_attribs

	# The same option dictionary is handed to both the constructor and upload().
	options = vars(args)
	connVDB = dengue_upload(**options)
	connVDB.upload(**options)
Esempio n. 4
0
	def separate_viruses_sequences(self, data, **kwargs):
		'''
		Split every record into a virus document and a sequence document.

		The keys kept for each document come from kwargs['virus_attribs'] and
		kwargs['sequence_attribs'] when those are supplied. If either is absent
		(or None), the module-level list of the same name is used instead; those
		lists only exist when this file is run as a script, so callers that
		import the module should pass both lists explicitly.

		data: iterable of dicts, one per record
		kwargs: forwarded unchanged to add_virus_fields and add_sequence_fields
		Returns a tuple (viruses, sequences) of two lists in input order.
		'''
		virus_keys = kwargs.get('virus_attribs')
		if virus_keys is None:
			virus_keys = virus_attribs # module-level fallback, defined in __main__ below
		sequence_keys = kwargs.get('sequence_attribs')
		if sequence_keys is None:
			sequence_keys = sequence_attribs # module-level fallback, defined in __main__ below

		viruses = []
		sequences = []
		for record in data:
			virus_doc = {key: value for key, value in record.items() if key in virus_keys}
			sequence_doc = {key: value for key, value in record.items() if key in sequence_keys}
			# Original author's note: the add_* helpers add attributes specified at
			# the command line, and universal fields like 'number of sequences'.
			viruses.append(self.add_virus_fields(virus_doc, **kwargs))
			sequences.append(self.add_sequence_fields(sequence_doc, **kwargs))
		return (viruses, sequences)


if __name__=="__main__":
	# Script entry point: parse the command line, fill in defaults for
	# options that were not given, then build the uploader and run it.
	args = parser.parse_args() # parser is an argparse object initiated in parse.py

	# These two lists must stay module-level names: separate_viruses_sequences()
	# reads them. Original author's note: they define the fields in fasta headers
	# that you want used in parse.py > parse > parse_fasta_file ---> (viruses, sequences)
	virus_attribs = ['strain', 'original_strain', 'virus', 'serotype','collection_date', 'region', 'country', 'division', 'location']
	sequence_attribs = ['accession', 'strain', 'original_strain', 'virus', 'serotype',  'locus', 'sequence', 'authors', 'PMID', 'source', 'gene_list']

	# Defaults for options left unset; compare with `is None` (identity), not `==`.
	if args.fname is None:
		# The file type is only forced to 'tsv' together with the default file name.
		args.fname = 'results.tbl'
		args.ftype = 'tsv'
	if args.virus is None:
		args.virus = 'dengue'
	if args.database is None:
		args.database = 'vdb'

	# Expose the attribute lists through the options as well.
	args.virus_attribs = virus_attribs
	args.sequence_attribs = sequence_attribs

	# The same option dictionary is handed to both the constructor and upload().
	options = vars(args)
	connVDB = dengue_upload(**options)
	connVDB.upload(**options)