def env454run_info_upload(runobj):
    """
    Call put_run_info() on a dbUpload built from runobj and print the
    elapsed time, measured with time() around that single call.
    """
    uploader = dbUpload(runobj)
    started_at = time()
    uploader.put_run_info()
    print("put_run_info time = %s" % str(time() - started_at))
    def setUpClass(cls):
        """
        Build the class-level fixtures.

        Opens a dbup.MyConnection to db "test" on host "vampsdev", issues
        three SET statements on it, creates a Run and a dbUpload from the
        fake data object, and attaches a SequenceSource whose seq/id are
        overwritten with fixed values.
        """
        cls._connection = dbup.MyConnection(host = "vampsdev", db = "test")
        # Executed one by one, in exactly this order.
        session_statements = (
            "SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;",
            "SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;",
            "SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='TRADITIONAL';",
        )
        for statement in session_statements:
            cls._connection.execute_no_fetch(statement)

        run_data = fake_data_object.data_object
        general = run_data['general']
        cls.file_path = os.path.join(
            '/Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test',
            general['platform'],
            general['run'],
            'lane_1',
            'analysis')

        # The Run is given the parent of the directory holding this file.
        here = os.path.dirname(os.path.realpath(__file__))
        cls._runobj = Run(run_data, os.path.join(here, ".."))
        cls._my_db_upload = dbup.dbUpload(cls._runobj)

        cls.filenames = []
        cls.seq_id_dict = {}
        cls.fasta_file_path = cls.file_path + "/reads_overlap/ATCACG_NNNNGTATC_3-PERFECT_reads.fa.unique"
        cls.stats_file = cls.file_path + "/unique_file_counts_test"

        source = u.SequenceSource(cls.fasta_file_path, lazy_init = False)
        cls.fasta = source
        # Pin the current record to known values for the tests.
        source.seq = "TGGGTTTGAACTACTGAGGGCCGGTACAGAGATGTACCCTTCCCTTCGGGGACTTCAGGAG"
        source.id = "D4ZHLFP1:25:B022DACXX:3:1101:14017:2243 1:N:0:ATCACG|frequency:1"
Ejemplo n.º 3
0
def env454upload_main(runobj, full_upload):
    """
    Upload Illumina data from fasta files to env454.

    Run: pipeline dbUpload testing -c test/data/JJH_KCK_EQP_Bv6v4.ini -s env454upload -l debug
    Assumes all run info is already on env454 (run, run_key, dataset,
    project, run_info_ill tables).
    Tables:
    sequence_ill
    sequence_pdr_info_ill
    taxonomy
    sequence_uniq_info_ill

    runobj      -- passed to dbUpload().
    full_upload -- when true, env454upload_seq() is called for each file;
                   it is also forwarded to env454upload_all_but_seq().
    """
    began = time.time()

    uploader = dbUpload(runobj)
    fasta_names = uploader.get_fasta_file_names()
    if not fasta_names:
        logger.debug("\nThere is something wrong with fasta files or their names, please check pathes, contents and suffixes in %s." % uploader.fasta_dir)

    for fasta_name in fasta_names:
        seqs = uploader.make_seq_upper(fasta_name)
        if full_upload:
            env454upload_seq(uploader, fasta_name, seqs)
        # Time a single call of get_seq_id_dict(seqs).
        lookup_seconds = timeit.timeit(
            wrapper(uploader.get_seq_id_dict, seqs), number=1)
        logger.debug("get_seq_id_dict() took %s sec to finish" % lookup_seconds)

    total_seq = env454upload_all_but_seq(uploader, fasta_names, full_upload)
    uploader.check_seq_upload()
    logger.debug("total_seq = %s" % total_seq)
    print("The whole upload took %s s" % (time.time() - began))
 def test_e_setUpRunInfo(self):
     """
     Run put_run_info() and check that the largest run_info_ill_id is 10.
     """
     uploader = dbup.dbUpload(self._runobj)
     uploader.put_run_info()
     max_id = self.get_id("SELECT max(run_info_ill_id) FROM run_info_ill")
     self.assertEqual(max_id, 10)
     print("done with put_run_info")
def env454upload(runobj):
    """
    Upload Illumina data from fasta files to env454.

    Run: pipeline dbUpload testing -c test/data/JJH_KCK_EQP_Bv6v4.ini -s env454upload -l debug
    For now upload only Illumina data to env454 from files, assuming that all
    run info is already on env454 (run, run_key, dataset, project,
    run_info_ill tables).
    TODO:
        2) Upload env454 data into raw, trim, gast etc tables from files

    For each name from get_fasta_file_names(): inserts the upper-cased
    sequences, builds the sequence id dict, then for every fasta record
    inserts pdr info, taxonomy and sequence_uniq_info_ill, and writes the
    per-file sequence count. Timings are logged at debug level. Files with
    no sequences are skipped.

    runobj -- passed to dbUpload().

    Any exception is reported on stdout and re-raised.
    """
    whole_start = time()

    my_env454upload = dbUpload(runobj)
    filenames = my_env454upload.get_fasta_file_names()
    seq_in_file = 0
    total_seq = 0
    for filename in filenames:
        try:
            logger.debug("\n----------------\nfilename = %s" % filename)
            basename = filename.split("/")[-1]
            filename_base = "-".join(basename.split("-")[:-1])
            if filename_base == "":
                # For v4v5 illumina: no "-" in the name, use the first three
                # "_"-separated fields instead.
                filename_base = "_".join(basename.split("_")[:3])
            run_info_ill_id = my_env454upload.get_run_info_ill_id(filename_base)
            gast_dict = my_env454upload.get_gasta_result(filename)

            read_fasta = u.ReadFasta(filename)
            try:
                # Upper-case here for VAMPS compatibility.
                sequences = [seq.upper() for seq in read_fasta.sequences]
            finally:
                # Close on every path; the empty-file `continue` below used
                # to skip the close and leak the reader.
                read_fasta.close()

            if not sequences:
                continue

            fasta = u.SequenceSource(filename, lazy_init = False)

            insert_pdr_info_time = 0
            insert_taxonomy_time = 0
            insert_sequence_uniq_info_ill_time = 0

            start = time()
            my_env454upload.insert_seq(sequences)
            insert_seq_time = time() - start
            # NOTE(review): seq_in_file is only updated after the record
            # loop, so this logs the previous file's count (0 for the first).
            logger.debug("seq_in_file = %s" % seq_in_file)
            logger.debug("insert_seq() took %s time to finish" % insert_seq_time)

            start = time()
            my_env454upload.get_seq_id_dict(sequences)
            get_seq_id_dict_time = time() - start
            logger.debug("get_seq_id_dict() took %s time to finish" % get_seq_id_dict_time)

            # Accumulate the time spent in each insert across all records.
            while fasta.next():
                start = time()
                my_env454upload.insert_pdr_info(fasta, run_info_ill_id)
                insert_pdr_info_time += time() - start

                start = time()
                my_env454upload.insert_taxonomy(fasta, gast_dict)
                insert_taxonomy_time += time() - start

                start = time()
                my_env454upload.insert_sequence_uniq_info_ill(fasta, gast_dict)
                insert_sequence_uniq_info_ill_time += time() - start

            seq_in_file = fasta.total_seq
            my_env454upload.put_seq_statistics_in_file(filename, fasta.total_seq)
            total_seq += seq_in_file
            logger.debug("insert_pdr_info() took %s time to finish" % insert_pdr_info_time)
            logger.debug("insert_taxonomy_time() took %s time to finish" % insert_taxonomy_time)
            logger.debug("insert_sequence_uniq_info_ill() took %s time to finish" % insert_sequence_uniq_info_ill_time)

        except:  # report anything unexpected, then re-raise unchanged
            print("\r[pipelineprocessor] Unexpected:")
            print(sys.exc_info()[0])  # exception type of the current error
            raise

    my_env454upload.check_seq_upload()
    logger.debug("total_seq = %s" % total_seq)
    whole_elapsed = time() - whole_start
    print("The whole_upload took %s s" % whole_elapsed)
Ejemplo n.º 6
0
def env454run_info_upload(runobj):
    """
    Run put_run_info() once on a dbUpload built from runobj and print the
    duration reported by timeit.
    """
    uploader = dbUpload(runobj)
    seconds = timeit.timeit(wrapper(uploader.put_run_info), number=1)
    print("put_run_info time = %s" % seconds)
Ejemplo n.º 7
0
def file_to_db_upload_main(runobj, full_upload):
    """
    Upload Illumina data from fasta files to the database and email a summary.

    Run: pipeline dbUpload testing -c test/data/JJH_KCK_EQP_Bv6v4.ini -s file_to_db_upload -l debug
    For now upload only Illumina data to env454 from files, assuming that all
    run info is already on env454 (run, run_key, dataset, project,
    run_info_ill tables).
    Tables:
    sequence_ill
    sequence_pdr_info_ill
    taxonomy
    sequence_uniq_info_ill
    (the AUTO_INCREMENT reset is currently disabled, see below)

    runobj      -- run object; database_name is optional (defaults to
                   "env454"), database_host and run are read for the email.
    full_upload -- when true, file_to_db_upload_seq() is called for each
                   file; it is also forwarded to
                   file_to_db_upload_all_but_seq().

    Problems are collected in the uploader's all_errors list and logged at
    debug level; nothing is returned.
    """
    whole_start = time.time()

    # Fall back to "env454" when the run object carries no database name.
    db_name = getattr(runobj, "database_name", "env454")
    my_file_to_db_upload = dbUpload(runobj, db_name=db_name)

    if not my_file_to_db_upload.filenames:
        err_msg = "\nThere is something wrong with fasta files or their names, please check pathes, contents and suffixes in %s." % my_file_to_db_upload.fasta_dir
        my_file_to_db_upload.all_errors.append(err_msg)
        logger.debug(err_msg)

    # NOTE(review): taken once before the loop, so the "get_and_up_seq"
    # figure logged below is cumulative across files.
    get_and_up_seq_time = time.time()
    total_time = 0
    no_run_info_list = []

    for filename in my_file_to_db_upload.filenames:
        sequences = my_file_to_db_upload.seq.prepare_fasta_dict(filename)
        if not len(sequences):
            err_msg = "There are 0 sequences in filename = %s" % filename
            logger.debug(err_msg)
            my_file_to_db_upload.all_errors.append(err_msg)
            continue
        if full_upload:
            file_to_db_upload_seq(my_file_to_db_upload, filename, sequences)
        wrapped = wrapper(my_file_to_db_upload.seq.get_seq_id_dict, sequences)
        get_seq_id_dict_time = timeit.timeit(wrapped, number=1)
        logger.debug("get_seq_id_dict() took %s sec to finish" %
                     get_seq_id_dict_time)

        get_and_up_seq_time_end = time.time() - get_and_up_seq_time
        logger.debug("get_and_up_seq took %s s" % get_and_up_seq_time_end)

        start_c = time.time()
        total_time = total_time + file_to_db_upload_all_but_seq(
            my_file_to_db_upload, filename, no_run_info_list, full_upload)
        logger.debug("file_to_db_upload_all_but_seq() took %s sec to finish" %
                     (time.time() - start_c))

    # doesn't work with mysql 5.6', not needed with no gap auto_increment
    # my_file_to_db_upload.reset_auto_increment()
    seq_count_msg = my_file_to_db_upload.check_seq_upload()

    projects_and_ids = my_file_to_db_upload.get_projects_and_ids()

    utils = PipelneUtils()
    if db_name == 'vamps2':
        my_email = '*****@*****.**'
    else:
        my_email = '*****@*****.**'

    # Use db_name rather than runobj.database_name: the attribute may be
    # missing (that is why the fallback above exists), and reading it here
    # would raise after the upload has already been done.
    ready_email_body = """Uploaded to %s on %s\nIn this run %s: %s\n%s\n%s
    """ % (db_name, runobj.database_host, runobj.run,
           projects_and_ids, my_file_to_db_upload.equal_amnt_files_txt,
           seq_count_msg)

    my_file_to_db_upload.send_message(my_email,
                                      'Projects uploaded to %s' % db_name,
                                      ready_email_body)

    if len(no_run_info_list) > 0:
        err_msg = "ERROR: There is no run info for %s yet, please check if it's in the csv and uploaded to the db" % ", ".join(
            no_run_info_list)
        utils.print_both(err_msg)
        my_file_to_db_upload.all_errors.append(err_msg)

    logger.debug("From file_to_db_upload_main. ready_email_body: ")
    logger.debug(ready_email_body)

    my_file_to_db_upload.all_errors.extend(my_file_to_db_upload.seq.seq_errors)
    if len(my_file_to_db_upload.all_errors) > 0:
        logger.debug('\n=====\nERRORS: \n' +
                     ';\n'.join(my_file_to_db_upload.all_errors))

    logger.debug("total_time = %s" % total_time)
    whole_elapsed = time.time() - whole_start
    logger.debug("The whole upload took %s s" % whole_elapsed)
Ejemplo n.º 8
0
def run_info_upload(runobj):
    """
    Run put_run_info() once on a dbUpload built from runobj and log the
    duration reported by timeit at debug level.
    """
    logger.debug("Start Run info upload to db")

    uploader = dbUpload(runobj)
    seconds = timeit.timeit(wrapper(uploader.put_run_info), number=1)
    logger.debug("put_run_info time = %s" % seconds)
def file_to_db_upload_main(runobj, full_upload):
    """
    Upload Illumina data from fasta files to the database and email a summary.

    Run: pipeline dbUpload testing -c test/data/JJH_KCK_EQP_Bv6v4.ini -s file_to_db_upload -l debug
    For now upload only Illumina data to env454 from files, assuming that all
    run info is already on env454 (run, run_key, dataset, project,
    run_info_ill tables).
    Tables:
    sequence_ill
    sequence_pdr_info_ill
    taxonomy
    sequence_uniq_info_ill
    (the AUTO_INCREMENT reset is currently disabled, see below)

    runobj      -- run object; database_name is optional (defaults to
                   "env454"), database_host and run are read for the email.
    full_upload -- when true, file_to_db_upload_seq() is called for each
                   file; it is also forwarded to
                   file_to_db_upload_all_but_seq().

    Problems are collected in the uploader's all_errors list and logged at
    debug level; nothing is returned.
    """
    whole_start = time.time()

    # Fall back to "env454" when the run object carries no database name.
    db_name = getattr(runobj, "database_name", "env454")
    my_file_to_db_upload = dbUpload(runobj, db_name = db_name)

    if not my_file_to_db_upload.filenames:
        err_msg = "\nThere is something wrong with fasta files or their names, please check pathes, contents and suffixes in %s." % my_file_to_db_upload.fasta_dir
        my_file_to_db_upload.all_errors.append(err_msg)
        logger.debug(err_msg)

    # NOTE(review): taken once before the loop, so the "get_and_up_seq"
    # figure logged below is cumulative across files.
    get_and_up_seq_time = time.time()
    total_time = 0
    no_run_info_list = []

    for filename in my_file_to_db_upload.filenames:
        sequences = my_file_to_db_upload.seq.prepare_fasta_dict(filename)
        if not len(sequences):
            err_msg = "There are 0 sequences in filename = %s" % filename
            logger.debug(err_msg)
            my_file_to_db_upload.all_errors.append(err_msg)
            continue
        if full_upload:
            file_to_db_upload_seq(my_file_to_db_upload, filename, sequences)
        wrapped = wrapper(my_file_to_db_upload.seq.get_seq_id_dict, sequences)
        get_seq_id_dict_time = timeit.timeit(wrapped, number = 1)
        logger.debug("get_seq_id_dict() took %s sec to finish" % get_seq_id_dict_time)

        get_and_up_seq_time_end = time.time() - get_and_up_seq_time
        logger.debug("get_and_up_seq took %s s" % get_and_up_seq_time_end)

        start_c = time.time()
        total_time = total_time + file_to_db_upload_all_but_seq(my_file_to_db_upload, filename, no_run_info_list,
                                                                full_upload)
        logger.debug("file_to_db_upload_all_but_seq() took %s sec to finish" % (time.time() - start_c))

    # doesn't work with mysql 5.6', not needed with no gap auto_increment
    # my_file_to_db_upload.reset_auto_increment()
    seq_count_msg = my_file_to_db_upload.check_seq_upload()

    projects_and_ids = my_file_to_db_upload.get_projects_and_ids()

    utils = PipelneUtils()
    if db_name == 'vamps2':
        my_email = '*****@*****.**'
    else:
        my_email = '*****@*****.**'

    # Use db_name rather than runobj.database_name: the attribute may be
    # missing (that is why the fallback above exists), and reading it here
    # would raise after the upload has already been done.
    ready_email_body = """Uploaded to %s on %s\nIn this run %s: %s\n%s\n%s
    """ % (db_name, runobj.database_host, runobj.run, projects_and_ids, my_file_to_db_upload.equal_amnt_files_txt, seq_count_msg)

    my_file_to_db_upload.send_message(my_email, 'Projects uploaded to %s' % db_name, ready_email_body)

    if len(no_run_info_list) > 0:
        err_msg = "ERROR: There is no run info for %s yet, please check if it's in the csv and uploaded to the db" % ", ".join(
            no_run_info_list)
        utils.print_both(err_msg)
        my_file_to_db_upload.all_errors.append(err_msg)

    logger.debug("From file_to_db_upload_main. ready_email_body: ")
    logger.debug(ready_email_body)

    my_file_to_db_upload.all_errors.extend(my_file_to_db_upload.seq.seq_errors)
    if len(my_file_to_db_upload.all_errors) > 0:
        logger.debug('\n=====\nERRORS: \n' + ';\n'.join(my_file_to_db_upload.all_errors))

    logger.debug("total_time = %s" % total_time)
    whole_elapsed = time.time() - whole_start
    logger.debug("The whole upload took %s s" % whole_elapsed)
def run_info_upload(runobj):
    """
    Log a start message, run put_run_info() once on a dbUpload built from
    runobj, and log the duration reported by timeit at debug level.
    """
    logger.debug("Start Run info upload to db")

    run_info_uploader = dbUpload(runobj)
    timed_call = wrapper(run_info_uploader.put_run_info)
    took = timeit.timeit(timed_call, number = 1)
    logger.debug("put_run_info time = %s" % took)
def env454upload(runobj):
    """
    Upload Illumina data from fasta files to env454.

    For now upload only Illumina data to env454 from files, assuming that all
    run info is already on env454 (run, run_key, dataset, project,
    run_info_ill tables).
    TODO:
        2) Upload env454 data into raw, trim, gast etc tables from files

    For each name from get_fasta_file_names(): inserts the sequences, builds
    the sequence id dict, then for every fasta record inserts pdr info,
    taxonomy and sequence_uniq_info_ill, and writes the per-file sequence
    count. Timings are logged at debug level. Files with no sequences are
    skipped.

    runobj -- passed to dbUpload().

    Any exception is reported on stdout and re-raised.
    """
    whole_start = time()

    my_env454upload = dbUpload(runobj)
    filenames = my_env454upload.get_fasta_file_names()
    seq_in_file = 0
    total_seq = 0
    for filename in filenames:
        try:
            logger.debug("\n----------------\nfilename = %s" % filename)
            # Base name is the file name without its last "-"-separated part.
            filename_base = "-".join(filename.split("/")[-1].split("-")[:-1])
            run_info_ill_id = my_env454upload.get_run_info_ill_id(filename_base)
            gast_dict = my_env454upload.get_gasta_result(filename)

            read_fasta = u.ReadFasta(filename)
            try:
                sequences = read_fasta.sequences
            finally:
                # Close on every path; the empty-file `continue` below used
                # to skip the close and leak the reader.
                read_fasta.close()

            if not len(sequences):
                continue

            fasta = u.SequenceSource(filename, lazy_init=False)

            insert_pdr_info_time = 0
            insert_taxonomy_time = 0
            insert_sequence_uniq_info_ill_time = 0

            start = time()
            my_env454upload.insert_seq(sequences)
            insert_seq_time = time() - start
            # NOTE(review): seq_in_file is only updated after the record
            # loop, so this logs the previous file's count (0 for the first).
            logger.debug("seq_in_file = %s" % seq_in_file)
            logger.debug("insert_seq() took %s time to finish" % insert_seq_time)

            start = time()
            my_env454upload.get_seq_id_dict(sequences)
            get_seq_id_dict_time = time() - start
            logger.debug("get_seq_id_dict() took %s time to finish" % get_seq_id_dict_time)

            # Accumulate the time spent in each insert across all records.
            while fasta.next():
                start = time()
                my_env454upload.insert_pdr_info(fasta, run_info_ill_id)
                insert_pdr_info_time += time() - start

                start = time()
                my_env454upload.insert_taxonomy(fasta, gast_dict)
                insert_taxonomy_time += time() - start

                start = time()
                my_env454upload.insert_sequence_uniq_info_ill(fasta, gast_dict)
                insert_sequence_uniq_info_ill_time += time() - start

            seq_in_file = fasta.total_seq
            my_env454upload.put_seq_statistics_in_file(filename, fasta.total_seq)
            total_seq += seq_in_file
            logger.debug("insert_pdr_info() took %s time to finish" % insert_pdr_info_time)
            logger.debug("insert_taxonomy_time() took %s time to finish" % insert_taxonomy_time)
            logger.debug("insert_sequence_uniq_info_ill() took %s time to finish" % insert_sequence_uniq_info_ill_time)

        except:  # report anything unexpected, then re-raise unchanged
            print("\r[pipelineprocessor] Unexpected:")
            print(sys.exc_info()[0])  # exception type of the current error
            raise

    print("total_seq = %s" % total_seq)
    my_env454upload.check_seq_upload()
    logger.debug("total_seq = %s" % total_seq)
    whole_elapsed = time() - whole_start
    print("The whole_upload took %s s" % whole_elapsed)