def process_file(dfile, access_token, member, metadata, taxonomy):
    try:
        verify_ubiome(dfile)
        tmp_directory = tempfile.mkdtemp()
        base_filename = dfile['basename'].replace('.zip', '')
        taxonomy_file = base_filename + '.taxonomy.json'
        raw_filename = temp_join(tmp_directory, taxonomy_file)
        metadata = {
            'description': 'uBiome 16S taxonomy data, JSON format.',
            'tags': ['json', 'uBiome', '16S']
        }
        with open(raw_filename, 'w') as raw_file:
            json.dump(taxonomy, raw_file)
            raw_file.flush()
        api.upload_aws(raw_filename, metadata, access_token,
                       base_url=OH_BASE_URL,
                       project_member_id=str(member['project_member_id']))
    except:
        api.message("uBiome integration: A broken file was deleted",
                    "While processing your uBiome file "
                    "we noticed that your file does not conform "
                    "to the expected specifications and it was "
                    "thus deleted. Email us at [email protected] if "
                    "you think this file should be valid.",
                    access_token, base_url=OH_BASE_URL)
        api.delete_file(access_token,
                        str(member['project_member_id']),
                        file_id=str(dfile['id']),
                        base_url=OH_BASE_URL)
        raise

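# temp_join() is a project helper used by several of these integrations but
# not shown here. A minimal sketch, assuming it simply joins a filename onto
# the temporary directory (an assumption, not the project's confirmed
# implementation):
import os


def temp_join(tmp_directory, filename):
    """Return the full path for `filename` inside `tmp_directory`."""
    return os.path.join(tmp_directory, filename)
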
def process_file(dfile, access_token, member, metadata):
    try:
        vcf_metadata = verify_vcf(dfile)
    except:
        api.message("VCF integration: A broken file was deleted",
                    "While processing your VCF file "
                    "we noticed that your file does not conform "
                    "to the expected specifications and it was "
                    "thus deleted. Email us at [email protected] if "
                    "you think this file should be valid.",
                    access_token, base_url=OH_BASE_URL)
        api.delete_file(access_token,
                        str(member['project_member_id']),
                        file_id=str(dfile['id']),
                        base_url=OH_BASE_URL)
        raise
    try:
        tmp_directory = tempfile.mkdtemp()
        base_filename = dfile['basename']
        # Save the extracted VCF metadata to a temp file.
        if base_filename.endswith('.gz'):
            base_filename = base_filename[0:-3]
        elif base_filename.endswith('.bz2'):
            base_filename = base_filename[0:-4]
        meta_filename = base_filename + '.metadata.json'
        raw_filename = temp_join(tmp_directory, meta_filename)
        metadata = {'description': 'VCF file metadata',
                    'tags': ['vcf']}
        with open(raw_filename, 'w') as raw_file:
            json.dump(vcf_metadata, raw_file)
            raw_file.flush()
        api.upload_aws(raw_filename, metadata, access_token,
                       base_url=OH_BASE_URL,
                       project_member_id=str(member['project_member_id']))
    except:
        api.message("VCF integration: File could not be uploaded",
                    "Something went wrong when processing your "
                    "file. Please try to upload it again. "
                    "Please email us at [email protected] if "
                    "this keeps happening.",
                    access_token, base_url=OH_BASE_URL)
        api.delete_file(access_token,
                        str(member['project_member_id']),
                        file_id=str(dfile['id']),
                        base_url=OH_BASE_URL)
        raise

def test_message_all_members_false_projectmemberid_has_invalid_char(self):
    with self.assertRaises(Exception):
        response = message(project_member_ids=['abcdef1', 'test'],
                           subject=SUBJECT, message=MESSAGE,
                           access_token=MASTER_ACCESS_TOKEN)
        assert response.json() == {
            "errors": {"project_member_ids":
                       ["Project member IDs are always 8" +
                        " digits long."]}}


def test_message_all_members_false_projectmemberid_has_invalid_digit(self):
    with self.assertRaises(Exception):
        response = message(project_member_ids=[INVALID_PMI1, INVALID_PMI2],
                           subject=SUBJECT, message=MESSAGE,
                           access_token=MASTER_ACCESS_TOKEN)
        assert response.json() == {
            "errors": {"project_member_ids":
                       ["Invalid project member ID(s):" +
                        " invalidPMI2"]}}

def process_file(dfile, access_token, member, metadata):
    infile_suffix = dfile['basename'].split(".")[-1]
    tf_in = tempfile.NamedTemporaryFile(suffix="." + infile_suffix)
    tf_in.write(requests.get(dfile['download_url']).content)
    tf_in.flush()
    tmp_directory = tempfile.mkdtemp()
    filename_base = 'Location History.json'
    location_data = get_json(tf_in)
    if location_data:
        location_json = json.loads(location_data)
        output_file = tmp_directory + '/' + filename_base
        with open(output_file, 'w') as raw_file:
            json.dump(location_json, raw_file)
        metadata = {
            'description': 'Google Location History JSON',
            'tags': ['google location history', 'gps'],
            'creation_date': arrow.get().format(),
        }
        api.upload_aws(output_file, metadata, access_token,
                       base_url=OH_BASE_URL,
                       project_member_id=str(member['project_member_id']))
        get_semantic_data(tf_in, tmp_directory, member, access_token)
    else:
        api.message("Google Location History: A broken file was deleted",
                    "While processing your Google Location History file "
                    "we noticed that your file does not conform "
                    "to the expected specifications and it was "
                    "thus deleted. Please make sure you upload "
                    "the right file:\nWe expect the file to be a "
                    "single json file "
                    "or a .zip file as downloaded from Google Takeout."
                    " Please "
                    "do not alter the original file, as unexpected "
                    "additions can invalidate the file.",
                    access_token, base_url=OH_BASE_URL)
        api.delete_file(access_token,
                        str(member['project_member_id']),
                        file_id=str(dfile['id']),
                        base_url=OH_BASE_URL)

def process_target(data_file, access_token, member, metadata):
    try:
        tf = tempfile.NamedTemporaryFile(suffix=".gz")
        tf_out = tempfile.NamedTemporaryFile(prefix="ftdna-", suffix=".csv",
                                             mode="w+b")
        print("downloading ftdna file from oh")
        tf.write(requests.get(data_file['download_url']).content)
        tf.flush()
        print('read ftdna file')
        with gzip.open(tf.name, "rt", newline="\n") as ftdna_file:
            for line in ftdna_file:
                if valid_line(line):
                    tf_out.write(line.encode('ascii'))
        tf_out.flush()
        tf_out.seek(0)
        print('cleaned file')
        api.delete_file(access_token,
                        str(member['project_member_id']),
                        file_id=str(data_file['id']))
        print('deleted old')
        upload_new_file(tf_out, access_token,
                        str(member['project_member_id']),
                        data_file['metadata'])
    except:
        print('delete broken file')
        api.delete_file(access_token,
                        str(member['project_member_id']),
                        file_id=str(data_file['id']))
        api.message(
            "A broken file was deleted",
            "While processing your FamilyTreeDNA file "
            "we noticed that your file does not conform "
            "to the expected specifications and it was "
            "thus deleted. Please make sure you upload "
            "the right file:\nWe expect the file to be a "
            "single gzipped file (ending in .gz) as "
            "you can download from FamilyTreeDNA. Please "
            "do not alter or unzip this file, as unexpected additions "
            "can invalidate the file.",
            access_token)

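# valid_line() is defined elsewhere in the FamilyTreeDNA project; the sketch
# below is only a hypothetical illustration, assuming an export row looks
# like RSID,CHROMOSOME,POSITION,RESULT (with an RSID header line) and that
# blank or malformed lines should be dropped:
def valid_line(line):
    # Keep the CSV header line untouched.
    if line.replace('"', '').startswith('RSID'):
        return True
    fields = line.rstrip('\n').replace('"', '').split(',')
    # Expect exactly four fields: rsid, chromosome, position, genotype.
    if len(fields) != 4:
        return False
    rsid, chromosome, position, genotype = fields
    return rsid.startswith('rs') and position.isdigit()
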
def test_message_all_members_false_project_member_id_not_none_valid(self):
    response = message(project_member_ids=[VALID_PMI1, VALID_PMI2],
                       subject=SUBJECT, message=MESSAGE,
                       access_token=ACCESS_TOKEN)
    self.assertEqual(response.status_code, 200)


def test_message_all_members_true_project_member_id_none(self):
    response = message(all_members=True, subject=SUBJECT, message=MESSAGE,
                       access_token=ACCESS_TOKEN)
    self.assertEqual(response.status_code, 200)


def test_message_invalid_access_token(self):
    response = message(subject=SUBJECT, message=MESSAGE,
                       access_token=ACCESS_TOKEN_INVALID)
    assert response.json() == {"detail": "Invalid token."}


def test_message_expired_access_token(self):
    response = message(subject=SUBJECT, message=MESSAGE,
                       access_token=ACCESS_TOKEN_EXPIRED)
    assert response.json() == {"detail": "Expired token."}


def test_message_valid_access_token(self):
    response = message(subject=SUBJECT, message=MESSAGE,
                       access_token=ACCESS_TOKEN)
    self.assertEqual(response.status_code, 200)

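# The message() tests above presumably live in a unittest.TestCase subclass
# with module-level constants. A minimal, hypothetical scaffold (all tokens
# and project member IDs are placeholders, and message() is assumed to come
# from ohapi.api in open-humans-api):
import unittest

from ohapi.api import message

SUBJECT = 'Test subject'
MESSAGE = 'Test message body.'
ACCESS_TOKEN = 'validToken'            # placeholder
ACCESS_TOKEN_EXPIRED = 'expiredToken'  # placeholder
ACCESS_TOKEN_INVALID = 'invalidToken'  # placeholder
MASTER_ACCESS_TOKEN = 'masterToken'    # placeholder
VALID_PMI1 = '12345678'                # project member IDs are 8 digits long
VALID_PMI2 = '87654321'
INVALID_PMI1 = 'invalidPMI1'
INVALID_PMI2 = 'invalidPMI2'


class MessageTestCase(unittest.TestCase):
    """Hypothetical container class; the test methods above would sit here."""
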
def upload_to_oh(oh_id):
    logger.info('{}: now uploading to OpenHumans'.format(oh_id))
    with open('{}/{}/header.txt'.format(OUT_DIR, oh_id), 'r') as headerobj:
        headiter = takewhile(lambda s: s.startswith('#'), headerobj)
        header = list(headiter)
    # construct the final header
    new_header = [
        '##FORMAT=<ID=GP,Number=3,Type=Float,Description="Estimated Posterior Probabilities (rounded to 3 digits) for Genotypes 0/0, 0/1 and 1/1">\n',
        '##INFO=<ID=INFO,Number=1,Type=Float,Description="Impute2 info metric">\n',
        '##imputerdate={}\n'.format(datetime.date.today().strftime("%m-%d-%y"))
    ]
    header.insert(-2, new_header[0])
    header.insert(-4, new_header[1])
    header.insert(1, new_header[2])
    # get all the contig info from the fasta index file
    fai = 'hg19.fasta.fai'
    contigs = pd.read_csv('{}/{}'.format(REF_FA, fai), sep='\t',
                          names=['ID', 'length'], usecols=[0, 1])
    contigs['ID'] = contigs['ID'].str.replace('chr', '')
    for row in contigs.itertuples():
        chrom = row[1]
        length = row[2]
        header.insert(-1, f'##contig=<ID={chrom},length={length}>\n')
    header = ''.join(header)
    # combine all vcfs
    os.chdir(settings.BASE_DIR)
    with open('{}/{}/member.imputed.vcf'.format(OUT_DIR, oh_id), 'w') as outfile:
        for chrom in CHROMOSOMES:
            fname = '{}/{}/chr{}/chr{}/final_impute2/chr{}.member.imputed.vcf'.format(
                OUT_DIR, oh_id, chrom, chrom, chrom)
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)
    member_vcf_fp = '{}/{}/member.imputed.vcf'.format(OUT_DIR, oh_id)
    # add the header to the combined vcf file.
    with open(member_vcf_fp, 'r') as original:
        data = original.read()
    with open(member_vcf_fp, 'w') as modified:
        modified.write(header + data)
    # bzip the file
    with open(member_vcf_fp, 'rb') as input_:
        with bz2.BZ2File(member_vcf_fp + '.bz2', 'wb',
                         compresslevel=9) as output:
            copyfileobj(input_, output)
    # upload file to OpenHumans
    process_source(oh_id)
    # Message Member
    oh_member = OpenHumansMember.objects.get(oh_id=oh_id)
    project_page = environ.get('OH_ACTIVITY_PAGE')
    explore_url = 'https://exploratory.openhumans.org/notebook/21/'
    visualization_url = 'https://exploratory.openhumans.org/notebook/26/'
    body = ("Check {} to see your imputed genotype results from Open Humans.\n"
            "Visualize your results here: {}\n"
            "Further explore your results here: {}".format(
                project_page, visualization_url, explore_url))
    api.message('Open Humans Imputation Complete', body,
                oh_member.access_token, project_member_ids=[oh_id])
    logger.info('{} emailed member'.format(oh_id))
    # check that the vcf file was uploaded.
    user_details = api.exchange_oauth2_member(oh_member.get_access_token())
    username = user_details['username']
    imputed = False
    for data in user_details['data']:
        if data['basename'] == 'member.imputed.vcf.bz2':
            imputed = True
    # alert the sentry admin that the pipeline has completed
    if imputed:
        # this is not really any error...properly log me please!
        logging.error(
            'Pipeline finished for member user name: {}, oh_id: {}'.format(
                username, oh_id))
    else:
        logging.error(
            'Error uploading imputed vcf for member -- user name: {}, oh_id: {}'
            .format(username, oh_id))
    # clean users files
    if not settings.DEBUG:
        os.chdir(settings.BASE_DIR)
        clean_command = ['imputer/clean_files.sh', '{}'.format(oh_id)]
        process = run(clean_command, stdout=PIPE, stderr=PIPE)
        logger.debug(process.stderr)
        logger.info('{} finished removing files'.format(oh_id))
    imputer_record = ImputerMember.objects.get(oh_id=oh_id, active=True)
    imputer_record.step = 'complete'
    imputer_record.active = False
    imputer_record.save()

def send_first_no_data_email(oh_id, oh_access_token):
    api.message('[GoogleFit] Data Import: No Data',
                'No GoogleFit data was found to import. You need to be '
                'using the GoogleFit app on your Android device to '
                'collect data.',
                oh_access_token, project_member_ids=[oh_id])


def send_first_success_email(oh_id, oh_access_token):
    api.message('[GoogleFit] Data Import: Success',
                'Your GoogleFit data was imported successfully to '
                'OpenHumans. Go to your dashboard to view: '
                'https://googlefit.openhumans.org',
                oh_access_token, project_member_ids=[oh_id])

def upload_to_oh(oh_id):
    logger.info('{}: now uploading to OpenHumans'.format(oh_id))
    with open('{}/{}/header.txt'.format(OUT_DIR, oh_id), 'r') as headerobj:
        headiter = takewhile(lambda s: s.startswith('#'), headerobj)
        header = list(headiter)
    # construct the final header
    new_header = [
        '##FORMAT=<ID=GP,Number=3,Type=Float,Description="Estimated Posterior Probabilities (rounded to 3 digits) for Genotypes 0/0, 0/1 and 1/1">\n',
        '##INFO=<ID=INFO,Number=1,Type=Float,Description="Impute2 info metric">\n',
        '##imputerdate={}\n'.format(datetime.date.today().strftime("%m-%d-%y"))
    ]
    header.insert(-2, new_header[0])
    header.insert(-4, new_header[1])
    header.insert(1, new_header[2])
    header = ''.join(header)
    # combine all vcfs
    os.chdir(settings.BASE_DIR)
    combine_command = ['imputer/combine_chrom.sh', '{}'.format(oh_id)]
    process = run(combine_command, stdout=PIPE, stderr=PIPE)
    logger.debug(process.stderr)
    member_vcf_fp = '{}/{}/member.imputed.vcf'.format(OUT_DIR, oh_id)
    # add the header to the combined vcf file.
    with open(member_vcf_fp, 'r') as original:
        data = original.read()
    with open(member_vcf_fp, 'w') as modified:
        modified.write(header + data)
    # bzip the file
    with open(member_vcf_fp, 'rb') as input_:
        with bz2.BZ2File(member_vcf_fp + '.bz2', 'wb',
                         compresslevel=9) as output:
            copyfileobj(input_, output)
    # upload file to OpenHumans
    process_source(oh_id)
    # Message Member
    oh_member = OpenHumansMember.objects.get(oh_id=oh_id)
    project_page = environ.get('OH_ACTIVITY_PAGE')
    api.message(
        'Open Humans Imputation Complete',
        'Check {} to see your imputed genotype results from Open Humans.'
        .format(project_page),
        oh_member.access_token, project_member_ids=[oh_id])
    logger.info('{} emailed member'.format(oh_id))
    # clean users files
    if not settings.DEBUG:
        os.chdir(settings.BASE_DIR)
        clean_command = ['imputer/clean_files.sh', '{}'.format(oh_id)]
        process = run(clean_command, stdout=PIPE, stderr=PIPE)
        logger.debug(process.stderr)
        logger.info('{} finished removing files'.format(oh_id))
    imputer_record = ImputerMember.objects.get(oh_id=oh_id, active=True)
    imputer_record.step = 'complete'
    imputer_record.active = False
    imputer_record.save()

def process_file(dfile, access_token, member, metadata):
    try:
        infile_suffix = dfile['basename'].split(".")[-1]
        tf_in = tempfile.NamedTemporaryFile(suffix="." + infile_suffix)
        tf_in.write(requests.get(dfile['download_url']).content)
        tf_in.flush()
        tmp_directory = tempfile.mkdtemp()
        filename_base = 'AncestryDNA-genotyping'
        raw_ancestry, chr_sex = clean_raw_ancestrydna(tf_in)
        raw_ancestry.seek(0)
        vcf_ancestry_unsort = vcf_from_raw_ancestrydna(raw_ancestry, chr_sex)
        # Save raw Ancestry genotyping to temp file.
        raw_filename = filename_base + '.txt'
        raw_filename = temp_join(tmp_directory, raw_filename)
        metadata = {
            'description': 'AncestryDNA full genotyping data, original format',
            'tags': ['AncestryDNA', 'genotyping'],
            'creation_date': arrow.get().format(),
        }
        with open(raw_filename, 'w') as raw_file:
            raw_ancestry.seek(0)
            shutil.copyfileobj(raw_ancestry, raw_file)
            raw_file.flush()
        api.upload_aws(raw_filename, metadata, access_token,
                       base_url=OH_BASE_URL,
                       project_member_id=str(member['project_member_id']))
        # Save VCF Ancestry genotyping to temp file.
        vcf_filename = filename_base + '.vcf.bz2'
        vcf_filename = temp_join(tmp_directory, vcf_filename)
        metadata = {
            'description': 'AncestryDNA full genotyping data, VCF format',
            'tags': ['AncestryDNA', 'genotyping', 'vcf'],
            'creation_date': arrow.get().format()
        }
        vcf_ancestry_unsort.seek(0)
        vcf_ancestry = sort_vcf(vcf_ancestry_unsort)
        with bz2.BZ2File(vcf_filename, 'w') as vcf_file:
            vcf_ancestry.seek(0)
            for i in vcf_ancestry:
                vcf_file.write(i)
        api.upload_aws(vcf_filename, metadata, access_token,
                       base_url=OH_BASE_URL,
                       project_member_id=str(member['project_member_id']))
    except:
        api.message("AncestryDNA integration: A broken file was deleted",
                    "While processing your AncestryDNA file "
                    "we noticed that your file does not conform "
                    "to the expected specifications and it was "
                    "thus deleted. Please make sure you upload "
                    "the right file:\nWe expect the file to be a "
                    "single txt file (either unzipped, bz2 zipped or gzipped) "
                    "or a .zip file that contains a single txt file (this is "
                    "what you can download from Ancestry right away). Please "
                    "do not alter the original txt file, as unexpected "
                    "additions can invalidate the file.",
                    access_token, base_url=OH_BASE_URL)
        raise
    finally:
        api.delete_file(access_token,
                        str(member['project_member_id']),
                        file_id=str(dfile['id']),
                        base_url=OH_BASE_URL)
