def __init__(
        self,
        igf_id_list,
        table_name,
        pipeline_name,
        dbconfig_file,
        log_slack=True,
        log_asana=True,
        slack_config=None,
        asana_project_id=None,
        asana_config=None,
        clean_up=True,
    ):
        '''
    :param igf_id_list: A file path listing igf ids to uniquely identify the entities
    :param table_name: A database table name to look for the igf id
                       available options are 'project','sample','experiment','run',
                                             'file','seqrun','collection'
    :param pipeline_name: A pipeline name to change the status of the seed
    :param dbconfig_file: A file containing the database configuration
    :param log_slack: A boolean flag for toggling Slack messages, default True
    :param log_asana: A boolean flag for toggling Asana messages, default True
    :param slack_config: A file containing Slack tokens, default None
    :param asana_config: A file containing Asana tokens, default None
    :param asana_project_id: A numeric Asana project id, default is None
    :param clean_up: Clean up the input file once it's processed, default True
    '''
        try:
            self.igf_id_list = igf_id_list
            if table_name not in ('project', 'sample', 'experiment', 'run',
                                  'file', 'seqrun', 'collection'):
                raise ValueError('Table {0} not supported for pipeline seed'.\
                                 format(table_name))
            self.table_name = table_name
            self.pipeline_name = pipeline_name
            self.clean_up = clean_up
            dbparams = read_dbconf_json(dbconfig_file)
            self.base_adaptor = BaseAdaptor(**dbparams)
            self.log_slack = log_slack
            self.log_asana = log_asana
            if log_slack and slack_config is None:
                raise ValueError('Missing slack config file')
            elif log_slack and slack_config:
                self.igf_slack = IGF_slack(slack_config)  # add slack object

            if log_asana and \
               (asana_config is None or \
                asana_project_id is None):
                raise ValueError(
                    'Missing asana config file or asana project id')
            elif log_asana and asana_config and asana_project_id:
                self.igf_asana = IGF_asana(
                    asana_config, asana_project_id)  # add asana object
        except:
            raise
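This constructor is identical to the one in the Modify_pipeline_seed class shown in full later (Example #7). A minimal construction sketch, assuming that class; every path below is hypothetical, and Slack/Asana logging is disabled so no token files are needed:

# hedged usage sketch; every path below is hypothetical
mps = Modify_pipeline_seed(
    igf_id_list='/path/to/seqrun_ids.txt',    # file with one igf id per line
    table_name='seqrun',                      # the only table handled by the reset logic
    pipeline_name='demultiplexing_fastq',     # hypothetical pipeline name
    dbconfig_file='/path/to/dbconfig.json',
    log_slack=False,                          # no slack_config needed when disabled
    log_asana=False)                          # no asana_config needed when disabled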
Example #2
    def __init__(self,
                 projet_info_path,
                 dbconfig,
                 user_account_template,
                 log_slack=True,
                 slack_config=None,
                 check_hpc_user=False,
                 hpc_user=None,
                 hpc_address=None,
                 ldap_server=None,
                 setup_irods=True,
                 notify_user=True,
                 default_user_email='*****@*****.**',
                 project_lookup_column='project_igf_id',
                 user_lookup_column='email_id',
                 data_authority_column='data_authority',
                 sample_lookup_column='sample_igf_id',
                 barcode_check_keyword='barcode_check',
                 metadata_sheet_name='Project metadata',
                 sendmail_exe='/usr/sbin/sendmail'):
        try:
            self.projet_info_path = projet_info_path
            self.user_account_template = user_account_template
            self.project_lookup_column = project_lookup_column
            self.user_lookup_column = user_lookup_column
            self.sample_lookup_column = sample_lookup_column
            self.data_authority_column = data_authority_column
            self.log_slack = log_slack
            dbparams = read_dbconf_json(dbconfig)
            base = BaseAdaptor(**dbparams)
            self.session_class = base.get_session_class()
            self.setup_irods = setup_irods
            self.notify_user = notify_user
            self.default_user_email = default_user_email
            self.barcode_check_keyword = barcode_check_keyword
            self.check_hpc_user = check_hpc_user
            self.hpc_user = hpc_user
            self.hpc_address = hpc_address
            self.ldap_server = ldap_server
            self.metadata_sheet_name = metadata_sheet_name
            self.sendmail_exe = sendmail_exe
            if log_slack and slack_config is None:
                raise ValueError('Missing slack config file')
            elif log_slack and slack_config:
                self.igf_slack = IGF_slack(slack_config=slack_config)

            if check_hpc_user and (hpc_user is None or \
                                   hpc_address is None or \
                                   ldap_server is None):
                raise ValueError('Hpc user {0}, address {1}, and ldap server {2} are required for check_hpc_user'.\
                                 format(hpc_user,hpc_address,ldap_server))
        except:
            raise
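This initialiser belongs to Find_and_register_new_project_data, shown in full in Example #9. A hedged construction sketch; the paths are hypothetical and the optional integrations are switched off so no tokens or LDAP access are needed:

# hedged usage sketch; every path below is hypothetical
registrar = Find_and_register_new_project_data(
    projet_info_path='/path/to/project_info',               # note the source spelling 'projet'
    dbconfig='/path/to/dbconfig.json',
    user_account_template='/path/to/user_account_email.html',
    log_slack=False,        # skip Slack logging, so slack_config may stay None
    check_hpc_user=False,   # skip the LDAP lookup of hpc users
    setup_irods=False,      # skip irods account creation
    notify_user=False)      # skip the account activation email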
Example #3
    def __init__(self,
                 seqrun_path,
                 seqrun_igf_list,
                 dbconfig_file,
                 clean_up=True,
                 json_collection_type='ILLUMINA_BCL_MD5',
                 log_slack=True,
                 log_asana=True,
                 slack_config=None,
                 asana_project_id=None,
                 asana_config=None,
                 samplesheet_name='SampleSheet.csv'):
        '''
    :param seqrun_path: A directory path for sequencing run home
    :param seqrun_igf_list: A file path listing sequencing runs to reset
    :param dbconfig_file: A file containing the database configuration
    :param clean_up: Clean up the input file once it's processed, default True
    :param json_collection_type: A collection type for md5 json file lookup, default ILLUMINA_BCL_MD5
    :param log_slack: A boolean flag for toggling Slack messages, default True
    :param log_asana: A boolean flag for toggling Asana messages, default True
    :param slack_config: A file containing Slack tokens, default None
    :param asana_config: A file containing Asana tokens, default None
    :param asana_project_id: A numeric Asana project id, default is None
    :param samplesheet_name: Name of the samplesheet file, default SampleSheet.csv
    '''
        try:
            self.seqrun_path = seqrun_path
            self.seqrun_igf_list = seqrun_igf_list
            self.json_collection_type = json_collection_type
            self.log_slack = log_slack
            self.log_asana = log_asana
            self.clean_up = clean_up
            self.samplesheet_name = samplesheet_name
            dbparams = read_dbconf_json(dbconfig_file)
            self.base_adaptor = BaseAdaptor(**dbparams)
            if log_slack and slack_config is None:
                raise ValueError('Missing slack config file')
            elif log_slack and slack_config:
                self.igf_slack = IGF_slack(slack_config)  # add slack object

            if log_asana and \
               (asana_config is None or \
                asana_project_id is None):
                raise ValueError(
                    'Missing asana config file or asana project id')
            elif log_asana and asana_config and asana_project_id:
                self.igf_asana = IGF_asana(
                    asana_config, asana_project_id)  # add asana object
        except:
            raise
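The class name is not visible in this snippet, so the name below is a placeholder. A hedged construction sketch with hypothetical paths:

# hypothetical class name and paths; the snippet above only shows __init__
reset_runs = Reset_seqrun_md5(
    seqrun_path='/path/to/seqrun_home',
    seqrun_igf_list='/path/to/seqruns_to_reset.txt',  # one run id per line
    dbconfig_file='/path/to/dbconfig.json',
    log_slack=False,
    log_asana=False)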
Example #4
def __init__(self, dbconfig_file, log_slack=True, slack_config=None):
    '''
    :param dbconfig_file: A database configuration file path
    :param log_slack: A boolean flag for toggling Slack messages, default True
    :param slack_config: A file containing Slack tokens, default None
    '''
    try:
        dbparams = read_dbconf_json(dbconfig_file)
        self.base_adaptor = BaseAdaptor(**dbparams)
        self.log_slack = log_slack
        if log_slack and slack_config is None:
            raise ValueError('Missing slack config file')
        elif log_slack and slack_config:
            self.igf_slack = IGF_slack(slack_config)  # add slack object
    except:
        raise
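A hedged construction sketch for this minimal initialiser; the class name is a placeholder since the snippet does not show it:

# hypothetical class name and paths
status_obj = Db_status_reporter(
    dbconfig_file='/path/to/dbconfig.json',
    log_slack=True,
    slack_config='/path/to/slack_config.json')  # required because log_slack is True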
Example #5
    def fetch_input(self):
        '''
    Fetch input method for base runnable
    
    :param dbconfig: A database configuration json file
    :param log_slack: A toggle for writing logs to slack
    :param log_asana: A toggle for writing logs to asana 
    '''
        try:
            dbconfig = self.param_required('dbconfig')
            dbparams = read_dbconf_json(dbconfig)
            base = BaseAdaptor(**dbparams)
            session_class = base.get_session_class()
            self.param('igf_session_class',
                       session_class)  # set session class for pipeline

            if self.param('log_slack'):
                slack_config = self.param_required('slack_config')
                igf_slack = IGF_slack(slack_config=slack_config)
                self.param('igf_slack', igf_slack)

        except:
            raise
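In the eHive pattern used here, param_required raises if a parameter is unset while param stores or retrieves a value. A hedged sketch of a companion method in the same runnable that consumes what fetch_input stored; the BaseAdaptor usage mirrors the other snippets on this page:

    # hedged sketch of a companion method in the same runnable class
    def run(self):
        session_class = self.param_required('igf_session_class')  # stored by fetch_input above
        base = BaseAdaptor(**{'session_class': session_class})
        base.start_session()                                      # open a db session
        try:
            pass                                                  # pipeline work would go here
        finally:
            base.close_session()                                  # always release the session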
Example #6
import argparse
from igf_data.utils.dbutils import clean_and_rebuild_database
from igf_data.task_tracking.igf_slack import IGF_slack  # import path assumed from the igf_data package layout

parser = argparse.ArgumentParser()
parser.add_argument('-d',
                    '--dbconfig_path',
                    required=True,
                    help='Database configuration json file')
parser.add_argument('-s',
                    '--slack_config',
                    required=True,
                    help='Slack configuration json file')
args = parser.parse_args()

dbconfig_path = args.dbconfig_path
slack_config = args.slack_config

slack_obj = IGF_slack(slack_config=slack_config)

if __name__ == '__main__':
    try:
        clean_and_rebuild_database(dbconfig=dbconfig_path)
        slack_obj.post_message_to_channel(
            message='All old data removed from database and new tables are created',
            reaction='pass')
    except Exception as e:
        message = 'Failed to remove old data and create new tables, error: {0}'.format(
            e)
        slack_obj.post_message_to_channel(message, reaction='fail')
        raise ValueError(message)
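One caveat with the script layout above: parser.parse_args() and the IGF_slack construction run at import time, before the __main__ guard. A minimal re-arrangement (a sketch using only the names already defined above) defers both to the guard:

# sketch: defer side effects until the script is actually executed
if __name__ == '__main__':
    args = parser.parse_args()                             # parse arguments only when run as a script
    slack_obj = IGF_slack(slack_config=args.slack_config)  # build the slack client inside the guard
    # ... followed by the same try/except block as above ...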
Example #7
class Modify_pipeline_seed:
    '''
  A class for changing pipeline run status in the pipeline_seed table
  '''
    def __init__(
        self,
        igf_id_list,
        table_name,
        pipeline_name,
        dbconfig_file,
        log_slack=True,
        log_asana=True,
        slack_config=None,
        asana_project_id=None,
        asana_config=None,
        clean_up=True,
    ):
        '''
    :param igf_id_list: A file path listing igf ids to uniquely identify the entities
    :param table_name: A database table name to look for the igf id
                       available options are 'project','sample','experiment','run',
                                             'file','seqrun','collection'
    :param pipeline_name: A pipeline name to change the status of the seed
    :param dbconfig_file: A file containing the database configuration
    :param log_slack: A boolean flag for toggling Slack messages, default True
    :param log_asana: A boolean flag for toggling Asana messages, default True
    :param slack_config: A file containing Slack tokens, default None
    :param asana_config: A file containing Asana tokens, default None
    :param asana_project_id: A numeric Asana project id, default is None
    :param clean_up: Clean up the input file once it's processed, default True
    '''
        try:
            self.igf_id_list = igf_id_list
            if table_name not in ('project', 'sample', 'experiment', 'run',
                                  'file', 'seqrun', 'collection'):
                raise ValueError('Table {0} not supported for pipeline seed'.\
                                 format(table_name))
            self.table_name = table_name
            self.pipeline_name = pipeline_name
            self.clean_up = clean_up
            dbparams = read_dbconf_json(dbconfig_file)
            self.base_adaptor = BaseAdaptor(**dbparams)
            self.log_slack = log_slack
            self.log_asana = log_asana
            if log_slack and slack_config is None:
                raise ValueError('Missing slack config file')
            elif log_slack and slack_config:
                self.igf_slack = IGF_slack(slack_config)  # add slack object

            if log_asana and \
               (asana_config is None or \
                asana_project_id is None):
                raise ValueError(
                    'Missing asana config file or asana project id')
            elif log_asana and asana_config and asana_project_id:
                self.igf_asana = IGF_asana(
                    asana_config, asana_project_id)  # add asana object
        except:
            raise

    def _fetch_pipeline_seed_entry(self,
                                   igf_id,
                                   select_seed_status=None,
                                   restrict_seed_status=None):
        '''
    An internal method for fetching unique pipeline seed entry from database
    :param igf_id: An igf id to uniquely select pipe seed data
    :param select_seed_status: A list of seed status to include from the query, default None
    :param restrict_seed_status: A list of seed status to exclude from the query, default None
    '''
        try:
            query = None
            if self.table_name == 'seqrun':
                query=self.base_adaptor.session.\
                           query(Pipeline_seed).\
                           join(Seqrun,Pipeline_seed.seed_id==Seqrun.seqrun_id).\
                           join(Pipeline).\
                           filter(Seqrun.seqrun_igf_id==igf_id).\
                           filter(Pipeline_seed.seed_table==self.table_name).\
                           filter(Pipeline.pipeline_id==Pipeline_seed.pipeline_id).\
                           filter(Pipeline.pipeline_name==self.pipeline_name)           # get base query for seqrun table
            else:
                raise ValueError('Table {0} not supported for pipeline status reset'.\
                                 format(self.table_name))

            if select_seed_status is not None and \
                 isinstance(select_seed_status,list) and \
                 len(select_seed_status) > 0:
                query = query.filter(
                    Pipeline_seed.status.in_(
                        select_seed_status))  # add generic select filter

            if restrict_seed_status is not None and \
                 isinstance(restrict_seed_status,list) and \
                 len(restrict_seed_status)>0:
                query = query.filter(
                    not_(Pipeline_seed.status.in_(
                        restrict_seed_status)))  # add generic restrict filter

            pipeseed_data=self.base_adaptor.fetch_records(query,\
                                                          output_mode='one_or_none')  # fetch unique value for pipeline seed
            return pipeseed_data
        except:
            raise

    def reset_pipeline_seed_for_rerun(self,
                                      seeded_label='SEEDED',
                                      restricted_status_list=('SEEDED',
                                                              'RUNNING')):
        '''
    A method for marking the pipeline for re-run if the first run failed or was aborted.
    This method will set the pipeline_seed.status to 'SEEDED' only if it's not already
    'SEEDED' or 'RUNNING'.
    :param seeded_label: A text label for seeded status, default SEEDED
    :param restricted_status_list: A list of pipeline status to exclude from the search,
                                   default ['SEEDED','RUNNING']
    '''
        try:
            db_connected = False
            restricted_status_list = list(restricted_status_list)
            input_id_list = self._read_input_list(
                igf_id_list=self.igf_id_list)  # get input ids from file
            failed_ids = list()  # define empty list of failed ids
            pass_list = list()  # required for logging in asana
            base = self.base_adaptor
            base.start_session()  # connect to database
            db_connected = True
            for igf_id in input_id_list:
                pipe_seed_data = self._fetch_pipeline_seed_entry(
                    igf_id=igf_id, restrict_seed_status=restricted_status_list
                )  # get pipe seed data for igf id
                if pipe_seed_data is None:
                    failed_ids.append(igf_id)  # add igf id to failed list
                else:
                    pl = PipelineAdaptor(**{'session': base.session
                                            })  # connect to pipeline adaptor
                    updated_seed_data = [{
                        'pipeline_id': pipe_seed_data.pipeline_id,
                        'seed_id': pipe_seed_data.seed_id,
                        'seed_table': pipe_seed_data.seed_table,
                        'status': seeded_label
                    }]  # set data for seed update
                    pl.update_pipeline_seed(
                        data=updated_seed_data,
                        autosave=False)  # update data to pipeline seed table
                    pass_list.append(igf_id)
            base.commit_session()  # save data to database after all changes
            base.close_session()  # close database connection
            db_connected = False
            if self.clean_up:
                self._clear_input_list(
                    file_path=self.igf_id_list, igf_list=failed_ids
                )  # overwrite input list with the failed ids for the next try
                message = 'Overwriting pipeseed input list {0}'.format(
                    self.igf_id_list)
                if self.log_slack:
                    self.igf_slack.post_message_to_channel(
                        message, reaction='pass'
                    )  # comment to slack about the file overwrite
            if len(pass_list) > 0:
                for id_line in pass_list:
                    message='Changed pipeline seed for id {0}, pipeline {1}, to {2}'.\
                            format(id_line,self.pipeline_name,seeded_label)
                    if self.log_slack:
                        self.igf_slack.post_message_to_channel(
                            message,
                            reaction='pass')  # comment to slack channel
                    if self.log_asana:
                        self.igf_asana.comment_asana_task(
                            task_name=id_line,
                            comment=message)  # comment on asana task
        except Exception as e:
            message = 'Failed to update pipeline seed, Error: {0}'.format(e)
            warnings.warn(message)
            if db_connected:
                base.rollback_session()
                base.close_session()
            if self.log_slack:
                self.igf_slack.post_message_to_channel(message,
                                                       reaction='fail')
            raise

    @staticmethod
    def _clear_input_list(file_path, igf_list):
        '''
    A static method for overwriting the input list file
    :param file_path: Path of the input list file, overwritten in place
    :param igf_list: A list of ids to write back to the file
    '''
        try:
            if not os.path.exists(file_path):
                raise IOError('File {0} not found'.format(file_path))

            with open(file_path, 'w') as fwp:
                fwp.write('\n'.join(igf_list))  # overwrite input list file
        except:
            raise

    @staticmethod
    def _read_input_list(igf_id_list):
        '''
    A static method for reading list of ids from an input file
    to a list
    :param igf_id_list: A file containing the input igf ids
    :return list: A list of ids from the input file
    '''
        try:
            if not os.path.exists(igf_id_list):
                raise IOError('File {0} not found'.format(igf_id_list))

            id_list = list()  # define an empty list of igf ids
            with open(igf_id_list, 'r') as fp:
                id_list = [i.strip() for i in fp]  # add ids to the list
            return id_list
        except:
            raise
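Given an instance constructed as in the sketch under the first example above (with Slack and Asana logging disabled), a hedged call of the reset workflow:

# mps constructed as in the sketch under the first example
mps.reset_pipeline_seed_for_rerun()  # default seeded_label='SEEDED'
# entries already 'SEEDED' or 'RUNNING' are skipped; with clean_up=True the
# input list file is overwritten with the ids that could not be reset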
Example #8
def __init__(self, slack_config_json, project_data_file):
    self.project_data_file = project_data_file
    self.igf_slack = IGF_slack(
        slack_config=slack_config_json)  # create slack client instance
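A hedged construction sketch; the class name is a placeholder since only __init__ is shown:

# hypothetical class name and paths
notifier = Project_data_notifier(
    slack_config_json='/path/to/slack_config.json',
    project_data_file='/path/to/project_data.csv')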
Example #9
class Find_and_register_new_project_data:
    '''
  A class for finding new project data and registering it to the db.
  Accounts for new users will be created on the irods server and passwords will be
  mailed to them.
  
  :param projet_info_path: A directory path for project info files
  :param dbconfig: A json dbconfig file
  :param check_hpc_user: Guess the hpc user name, True or False, default: False
  :param hpc_user: A hpc user name, default is None
  :param hpc_address: A hpc host address, default is None
  :param ldap_server: A ldap server address for search, default is None
  :param user_account_template: A template file for user account activation email
  :param log_slack: Enable or disable sending message to slack, default: True
  :param slack_config: A slack config json file, required if log_slack is True
  :param project_lookup_column: project data lookup column, default project_igf_id
  :param user_lookup_column: user data lookup column, default email_id
  :param sample_lookup_column: sample data lookup column, default sample_igf_id
  :param data_authority_column: data authority column name, default data_authority
  :param setup_irods: Setup irods account for user, default is True
  :param notify_user: Send email notification to user, default is True
  :param default_user_email: Add another user as the default collaborator for all new projects, default [email protected]
  :param barcode_check_keyword: Project attribute name for barcode check settings, default barcode_check
  :param sendmail_exe: Sendmail executable path, default /usr/sbin/sendmail
  '''
    def __init__(self,
                 projet_info_path,
                 dbconfig,
                 user_account_template,
                 log_slack=True,
                 slack_config=None,
                 check_hpc_user=False,
                 hpc_user=None,
                 hpc_address=None,
                 ldap_server=None,
                 setup_irods=True,
                 notify_user=True,
                 default_user_email='*****@*****.**',
                 project_lookup_column='project_igf_id',
                 user_lookup_column='email_id',
                 data_authority_column='data_authority',
                 sample_lookup_column='sample_igf_id',
                 barcode_check_keyword='barcode_check',
                 metadata_sheet_name='Project metadata',
                 sendmail_exe='/usr/sbin/sendmail'):
        try:
            self.projet_info_path = projet_info_path
            self.user_account_template = user_account_template
            self.project_lookup_column = project_lookup_column
            self.user_lookup_column = user_lookup_column
            self.sample_lookup_column = sample_lookup_column
            self.data_authority_column = data_authority_column
            self.log_slack = log_slack
            dbparams = read_dbconf_json(dbconfig)
            base = BaseAdaptor(**dbparams)
            self.session_class = base.get_session_class()
            self.setup_irods = setup_irods
            self.notify_user = notify_user
            self.default_user_email = default_user_email
            self.barcode_check_keyword = barcode_check_keyword
            self.check_hpc_user = check_hpc_user
            self.hpc_user = hpc_user
            self.hpc_address = hpc_address
            self.ldap_server = ldap_server
            self.metadata_sheet_name = metadata_sheet_name
            self.sendmail_exe = sendmail_exe
            if log_slack and slack_config is None:
                raise ValueError('Missing slack config file')
            elif log_slack and slack_config:
                self.igf_slack = IGF_slack(slack_config=slack_config)

            if check_hpc_user and (hpc_user is None or \
                                   hpc_address is None or \
                                   ldap_server is None):
                raise ValueError('Hpc user {0}, address {1}, and ldap server {2} are required for check_hpc_user'.\
                                 format(hpc_user,hpc_address,ldap_server))
        except:
            raise

    def process_project_data_and_account(self):
        '''
    A method for finding new project info, registering it to the database
    and creating user accounts
    '''
        try:
            new_project_info_list = self._find_new_project_info()
            if len(new_project_info_list) == 0:
                if self.log_slack:
                    self.igf_slack.post_message_to_channel(message='No project info found',\
                                                           reaction='sleep')

            for project_info_file in new_project_info_list:
                try:
                    new_data = self._read_project_info_and_get_new_entries(
                        project_info_file
                    )  # get new project, user and samples information
                    self._check_and_register_data(data=new_data,\
                                                  project_info_file=project_info_file)    # register data
                    if self.log_slack:
                        message='loaded new metadata from file {0}'.\
                              format(os.path.basename(project_info_file))
                        self.igf_slack.post_message_to_channel(message,
                                                               reaction='pass')

                except Exception as e:  # if error found in one file, skip the file
                    message='skipped project info file {0}, got error {1}'.\
                            format(project_info_file,e)
                    warnings.warn(message)
                    if self.log_slack:
                        self.igf_slack.post_message_to_channel(
                            message, reaction='fail')  # send message to slack
        except Exception as e:
            if self.log_slack:
                message = 'Error in registering project info: {0}'.format(e)
                self.igf_slack.post_message_to_channel(message,
                                                       reaction='fail')
            raise

    def _check_existing_data(self,
                             data,
                             dbsession,
                             table_name,
                             check_column='EXISTS'):
        '''
    An internal method for checking whether data already exists in a database table
    
    :param data: A pandas data series
    :param dbsession: A sqlalchemy database session object
    :param table_name: A database table name
    :param check_column: Column name for existing data
    '''
        try:
            if not isinstance(data, pd.Series):
                raise ValueError('Expecting a data series and got {0}'.format(
                    type(data)))
            if table_name == 'project':
                if self.project_lookup_column in data and \
                   not pd.isnull(data[self.project_lookup_column]):
                    project_igf_id = data[self.project_lookup_column]
                    pa = ProjectAdaptor(**{'session': dbsession
                                           })  # connect to project adaptor
                    project_exists = pa.check_project_records_igf_id(
                        project_igf_id)
                    if project_exists:  # store data only if project is not existing
                        data[check_column] = True
                    else:
                        data[check_column] = False
                    return data
                else:
                    raise ValueError('Missing or empty required column {0}'.\
                                     format(self.project_lookup_column))
            elif table_name == 'user':
                if self.user_lookup_column in data and \
                   not pd.isnull(data[self.user_lookup_column]):
                    user_email = data[self.user_lookup_column]
                    ua = UserAdaptor(**{'session':
                                        dbsession})  # connect to user adaptor
                    user_exists = ua.check_user_records_email_id(
                        email_id=user_email)
                    if user_exists:  # store data only if user is not existing
                        data[check_column] = True
                    else:
                        data[check_column] = False
                    return data
                else:
                    raise ValueError('Missing or empty required column {0}'.\
                                     format(self.user_lookup_column))
            elif table_name == 'sample':
                if self.sample_lookup_column in data and \
                   not pd.isnull(data[self.sample_lookup_column]):
                    project_igf_id = data[self.project_lookup_column]
                    sample_igf_id = data[self.sample_lookup_column]
                    sa = SampleAdaptor(**{'session': dbsession
                                          })  # connect to sample adaptor
                    sample_project_exists=sa.check_project_and_sample(project_igf_id=project_igf_id,\
                                                                      sample_igf_id=sample_igf_id) # check for existing sample_id and project-id combination
                    if sample_project_exists:  # store data only if sample is not existing
                        data[check_column] = True
                    else:
                        sample_exists = sa.check_sample_records_igf_id(
                            sample_igf_id)  # check for existing sample
                        if sample_exists:
                            raise ValueError('Sample {0} exists in database but not associated with project {1}'.\
                                             format(sample_igf_id,project_igf_id))            # inconsistency in sample project combination
                        data[check_column] = False
                    return data
                else:
                    raise ValueError('Missing or empty required column {0}'.\
                                     format(self.sample_lookup_column))
            elif table_name == 'project_user':
                if self.user_lookup_column in data and \
                    not pd.isnull(data[self.user_lookup_column]) and \
                   self.project_lookup_column in data and \
                    not pd.isnull(data[self.project_lookup_column]):
                    project_igf_id = data[self.project_lookup_column]
                    user_email = data[self.user_lookup_column]
                    pa = ProjectAdaptor(**{'session': dbsession
                                           })  # connect to project adaptor
                    project_user_exists=pa.check_existing_project_user(project_igf_id,\
                                                                       email_id=user_email)
                    if user_email != self.default_user_email and \
                       (self.data_authority_column not in data or \
                        pd.isnull(data[self.data_authority_column])):
                        data[self.data_authority_column] = True  # set user as data authority, filter default user

                    if project_user_exists:  # store data only if project user is not existing
                        data[check_column] = True
                    else:
                        data[check_column] = False
                    return data
                else:
                    raise ValueError('Missing or empty required column {0}, {1}'.\
                                     format(self.project_lookup_column,\
                                            self.user_lookup_column))
            else:
                raise ValueError('table {0} not supported'.format(table_name))
        except:
            raise


    def _notify_about_new_user_account(self,data,user_col='username',\
                             password_col='password',hpc_user_col='hpc_username',\
                             name_col='name',email_id_col='email_id'):
        '''
    An internal method for sending mail to new user with their password
    
    :param data: A pandas series containing user data
    :param user_col: Column name for username, default username
    :param password_col: Column name for password, default password
    :param hpc_user_col: Column name for hpc_username, default hpc_username
    :param name_col: Column name for name, default name
    :param email_id_col: Column name for email id, default email_id
    '''
        try:
            if not isinstance(data, pd.Series):
                raise ValueError('Expecting a pandas series and got {0}'.\
                                 format(type(data)))
            username = data[user_col]
            fullname = data[name_col]
            password = data[password_col]
            email_id = data[email_id_col]

            if hpc_user_col not in data or pd.isnull(
                    data[hpc_user_col]):  # send email only to non-hpc users
                template_dir = os.path.dirname(self.user_account_template)
                template_env=Environment(loader=FileSystemLoader(searchpath=template_dir), \
                                         autoescape=select_autoescape(['html','xml']))  # set template env
                template_file=template_env.\
                              get_template(os.path.basename(self.user_account_template))
                temp_work_dir = get_temp_dir()  # get a temp dir
                report_output_file = os.path.join(temp_work_dir,
                                                  'email_template.txt')
                template_file.\
                  stream(userEmail=email_id, \
                         fullName=fullname,\
                         userName=username,\
                         userPass=password,\
                        ).\
                  dump(report_output_file)
                read_cmd = ['cat', quote(report_output_file)]
                proc = subprocess.Popen(read_cmd, stdout=subprocess.PIPE)
                sendmail_cmd = [self.sendmail_exe, '-t']
                subprocess.check_call(sendmail_cmd, stdin=proc.stdout)
                proc.stdout.close()
                if proc.wait() != 0:                                            # wait for cat to finish and check its exit status
                    raise ValueError('Failed running command {0}:{1}'.format(read_cmd,\
                                                                             proc.returncode))
                remove_dir(temp_work_dir)
        except:
            raise

    @staticmethod
    def _get_user_password(password_length=12):
        '''
    An internal staticmethod for generating random password
    
    :param password_length: Required length of password, default 12
    '''
        try:
            new_password = None  # default value of the new password is None
            symbols = '^!'  # allowed symbols in password
            chars=string.ascii_lowercase+\
                  string.ascii_uppercase+\
                  string.digits+\
                  symbols                                                             # a string of lower case and upper case letters, digits and symbols
            symbol_pattern = re.compile(r'^[{0}]'.format(string.punctuation))
            digit_pattern = re.compile(r'^[0-9]+')
            while new_password is None or \
                  re.match(symbol_pattern,new_password) or \
                  re.match(digit_pattern,new_password):                               # password can't be None or starts with digit or a symbol
                new_password=''.join([chars[ord(os.urandom(1)) % len(chars)] \
                                      for i in range(password_length)])                 # assign a new random password
            return new_password
        except:
            raise


    def _setup_irods_account(self,data,user_col='username',\
                             password_col='password',\
                             hpc_user_col='hpc_username',\
                            ):
        '''
    An internal method for creating new user account in irods

    :param data: A pandas series containing user data
    :param user_col: Column name for username, default username
    :param password_col: Column name for password, default password
    :param hpc_user_col: Column name for hpc_username, default hpc_username
    '''
        try:
            if not isinstance(data, pd.Series):
                raise ValueError('Expecting a pandas series and got {0}'.\
                                 format(type(data)))

            if user_col not in data or pd.isnull(data[user_col]):
                raise ValueError('Missing required username')

            if (hpc_user_col not in data or pd.isnull(data[hpc_user_col])) and \
               (password_col not in data or pd.isnull(data[password_col])):
                raise ValueError('Missing required field password for non-hpc user {0}'.\
                                 format(data[user_col]))

            username = data[user_col]
            hpc_username = data[hpc_user_col]
            password = data[password_col]

            check_cmd1 = ['iadmin', 'lu']
            check_cmd2 = ['grep', '-w', quote(username)]
            c_proc1 = subprocess.Popen(check_cmd1, stdout=subprocess.PIPE)
            c_proc2 = subprocess.Popen(check_cmd2,
                                       stdin=c_proc1.stdout,
                                       stdout=subprocess.PIPE)
            c_proc1.stdout.close()
            result = c_proc2.communicate()[0]
            result = result.decode('UTF-8')
            if c_proc1.wait() != 0:                                             # check iadmin exit status after the pipe drains
                raise ValueError('Failed running command {0}:{1}'.format(check_cmd1,\
                                                                         c_proc1.returncode))
            if result != '' and pd.isnull(
                    data[hpc_user_col]):  # for non hpc users
                if self.check_hpc_user:
                    raise ValueError('Can not reset iRODS password for non hpc user {0} with check_hpc_user option'.\
                                     format(username))
                else:
                    if password is not None and password != '':
                        irods_passwd_cmd='{0} {1} {2}#{3} {4} {5}'.\
                                         format('iadmin',
                                                'moduser',
                                                quote(username),
                                                'igfZone',
                                                'password',
                                                quote(password))                            # format irods command for shell
                        subprocess.check_call(irods_passwd_cmd, shell=True)
                        if self.log_slack:
                            message='resetting irods account password for non-hpc user: {0}, password length: {1}'.\
                                  format(username,len(password))
                            self.igf_slack.post_message_to_channel(
                                message, reaction='pass')
                    else:
                        raise ValueError('Missing password for non-hpc user {0}'.\
                                         format(quote(username)))
            elif result == '':
                irods_mkuser_cmd=['iadmin', 'mkuser', \
                                  '{0}#igfZone'.format(quote(username)), 'rodsuser']
                subprocess.check_call(irods_mkuser_cmd)  # create irods user
                irods_chmod_cmd=['ichmod', '-M', 'own', 'igf', \
                               '/igfZone/home/{0}'.format(quote(username))]
                subprocess.check_call(
                    irods_chmod_cmd)  # change permission for irods user
                irods_inherit_cmd=['ichmod','-r', 'inherit', \
                                   '/igfZone/home/{0}'.format(quote(username))]
                subprocess.check_call(irods_inherit_cmd)  # inherit irods user

                if (hpc_username is None or hpc_username == '') and \
                   (password is not None and password != ''):
                    if len(password) > 20:
                        raise ValueError('check password for non hpc user {0}: {1}'.\
                                         format(username,password))                         # it could be the encrypted password

                    irods_passwd_cmd='{0} {1} {2}#{3} {4} {5}'.\
                                       format('iadmin',
                                              'moduser',
                                              quote(username),
                                              'igfZone',
                                              'password',
                                              quote(password))                            # format irods command for shell
                    subprocess.check_call(
                        irods_passwd_cmd,
                        shell=True)  # set password for non-hpc user
                    if self.log_slack:
                        message='created irods account for non-hpc user: {0}'.\
                                format(username)
                        self.igf_slack.post_message_to_channel(message,
                                                               reaction='pass')
        except:
            raise

    def _get_hpc_username(self, username):
        '''
    An internal method for checking hpc accounts for new users
    This method is not reliable as the ldap server can be down from time to time

    :param username: A username string
    '''
        try:
            cmd1=['ssh', \
                 '{0}@{1}'.format(quote(self.hpc_user),quote(self.hpc_address)), \
                 'ldapsearch -x -h {0}'.format(quote(self.ldap_server)), \
                ]
            cmd2=['grep',\
                  '-w',\
                  'uid: {0}'.format(quote(username)), \
                 ]
            proc1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE)
            proc2 = subprocess.Popen(cmd2,
                                     stdin=proc1.stdout,
                                     stdout=subprocess.PIPE)
            proc1.stdout.close()
            result = proc2.communicate()[0]
            result = result.decode('UTF-8')
            if proc1.wait() != 0:                                               # check ssh exit status after the pipe drains
                raise ValueError('Failed running command {0}:{1}'.format(cmd1,\
                                                                         proc1.returncode))
            if result == '':
                hpc_username = None
            else:
                hpc_username = username
            return hpc_username
        except:
            raise


    def _assign_username_and_password(self,data,user_col='username',\
                                      hpc_user_col='hpc_username',\
                                      password_col='password',\
                                      email_col='email_id',
                                      hpc_category='HPC_USER',
                                      category_col='category'):
        '''
    An internal method for assigning new user account and password

    :param data: A pandas series containing user data
    :param user_col: Column name for username, default username
    :param password_col: Column name for password, default password
    :param hpc_user_col: Column name for hpc_username, default hpc_username
    :param email_col: Column name for email id, default email_id
    :param category_col: Column name for user category, default category
    :param hpc_category: Category tag for hpc user, default: HPC_USER
    '''
        try:
            if not isinstance(data, pd.Series):
                raise ValueError('Expecting a pandas series and got {0}'.\
                                 format(type(data)))

            if (user_col not in data or pd.isnull(data[user_col])) and \
               (hpc_user_col in data and not pd.isnull(data[hpc_user_col])):          # if hpc username found, make it username
                data[user_col] = data[hpc_user_col]

            if (user_col not in data or (user_col in data and pd.isnull(
                    data[user_col]))):  # assign username from email id
                username, _ = data[email_col].split(
                    '@', 1)  # get username from email id
                data[user_col]=username[:10] if len(username)>10 \
                                             else username                              # allowing only first 10 chars of the email id

            if (hpc_user_col not in data or pd.isnull(data[hpc_user_col])) and \
               self.check_hpc_user:                                                   # assign hpc username
                hpc_username = self._get_hpc_username(username=data[user_col])
                data[hpc_user_col] = hpc_username  # set hpc username

            if user_col in data and not pd.isnull(data[user_col]) and \
               hpc_user_col in data and not pd.isnull(data[hpc_user_col]) and \
               data[user_col] != data[hpc_user_col]:                                  # if user name and hpc username both are present, they should be same
                raise ValueError('username {0} and hpc_username {1} should be same'.\
                                 format(data[user_col],data[hpc_user_col]))

            if (hpc_user_col not in data or pd.isnull(data[hpc_user_col])) and \
               (password_col not in data or pd.isnull(data[password_col])):
                data[password_col] = self._get_user_password(
                )  # assign a random password if its not supplied

            if (category_col not in data or pd.isnull(data[category_col])) and \
               (hpc_user_col in data and not pd.isnull(data[hpc_user_col])):          # set user category for hpc users
                data[category_col] = hpc_category
            return data
        except:
            raise

    def _add_default_user_to_project(self, project_user_data):
        '''
    An internal method for adding default user to the project_user_data dataframe

    :param project_user_data: A dataframe containing project_igf_id and email_id column
    :returns: a pandas dataframe with new row for the project_igf_id and default_user_email
    '''
        try:
            new_project_user_data = list()
            for row in project_user_data.to_dict(orient='records'):
                new_project_user_data.append(row)
                row2 = deepcopy(row)
                row2[self.user_lookup_column] = self.default_user_email
                new_project_user_data.append(row2)
            new_project_user_data = pd.DataFrame(new_project_user_data)
            return new_project_user_data
        except:
            raise

    def _check_and_register_data(self, data, project_info_file):
        '''
    An internal method for checking and registering data

    :param data: A dictionary containing following keys
    
          project_data
          user_data
          project_user_data
          sample_data
    :param project_info_file: A filepath for project info
    '''
        try:
            db_connected = False
            project_data = pd.DataFrame(data['project_data'])
            user_data = pd.DataFrame(data['user_data'])
            project_user_data = pd.DataFrame(data['project_user_data'])
            sample_data = pd.DataFrame(data['sample_data'])
            base = BaseAdaptor(**{'session_class': self.session_class})
            base.start_session()  # connect_to db
            db_connected = True
            project_data = project_data[project_data[
                self.project_lookup_column].isnull() == False]
            project_data = project_data.drop_duplicates()
            if project_data.index.size > 0:
                project_data=project_data.\
                             apply(lambda x: \
                                   self._check_existing_data(\
                                      data=x,\
                                      dbsession=base.session, \
                                      table_name='project',
                                      check_column='EXISTS'),\
                                   axis=1)                                              # get project map
                project_data = project_data[project_data['EXISTS'] ==
                                            False]  # filter existing projects
                project_data.drop('EXISTS', axis=1,
                                  inplace=True)  # remove extra column

            user_data = user_data[user_data[self.user_lookup_column].isnull()
                                  == False]
            user_data = user_data.drop_duplicates()
            if user_data.index.size > 0:
                user_data=user_data.apply(lambda x: \
                                        self._assign_username_and_password(x), \
                                        axis=1)                                         # check for user account and password
                user_data=user_data.\
                          apply(lambda x: \
                                self._check_existing_data(\
                                      data=x,\
                                      dbsession=base.session, \
                                      table_name='user',
                                      check_column='EXISTS'),\
                                axis=1)                                                 # get user map
                user_data = user_data[user_data['EXISTS'] ==
                                      False]  # filter existing users
                user_data.drop('EXISTS', axis=1,
                               inplace=True)  # remove extra column

            sample_data = sample_data[sample_data[
                self.sample_lookup_column].isnull() == False]
            sample_data = sample_data.drop_duplicates()
            if sample_data.index.size > 0:
                sample_data=sample_data.\
                             apply(lambda x: \
                                   self._check_existing_data(\
                                      data=x,\
                                      dbsession=base.session, \
                                      table_name='sample',
                                      check_column='EXISTS'),\
                                   axis=1)                                              # get sample map
                sample_data = sample_data[sample_data['EXISTS'] ==
                                          False]  # filter existing samples
                sample_data.drop('EXISTS', axis=1,
                                 inplace=True)  # remove extra column

            project_user_data = project_user_data.drop_duplicates()
            project_user_data_mask=(project_user_data[self.project_lookup_column].isnull()==False) & \
                                   (project_user_data[self.user_lookup_column].isnull()==False)
            project_user_data = project_user_data[
                project_user_data_mask]  # not allowing any empty values for project or user lookup
            if project_user_data.index.size > 0:
                project_user_data = self._add_default_user_to_project(
                    project_user_data
                )  # update project_user_data with default users
                project_user_data=project_user_data.\
                                  apply(lambda x: \
                                   self._check_existing_data(\
                                      data=x,\
                                      dbsession=base.session, \
                                      table_name='project_user',
                                      check_column='EXISTS'),\
                                   axis=1)                                              # get project user map
                project_user_data = project_user_data[project_user_data[
                    'EXISTS'] == False]  # filter existing project user
                project_user_data.drop('EXISTS', axis=1,
                                       inplace=True)  # remove extra column

            if len(project_data.index) > 0:  # store new projects
                pa1 = ProjectAdaptor(**{'session': base.session
                                        })  # connect to project adaptor
                pa1.store_project_and_attribute_data(
                    data=project_data, autosave=False)  # load project data

            if len(user_data.index) > 0:  # store new users
                ua = UserAdaptor(**{'session': base.session})
                ua.store_user_data(data=user_data,
                                   autosave=False)  # load user data

            if len(project_user_data.index) > 0:  # store new project users
                pa2 = ProjectAdaptor(**{'session': base.session
                                        })  # connect to project adaptor
                project_user_data = project_user_data.to_dict(
                    orient='records')  # convert dataframe to dictionary
                pa2.assign_user_to_project(
                    data=project_user_data,
                    autosave=False)  # load project user data

            if len(sample_data.index) > 0:  # store new samples
                sa = SampleAdaptor(**{'session': base.session
                                      })  # connect to sample adaptor
                sa.store_sample_and_attribute_data(
                    data=sample_data, autosave=False)  # load samples data

            if self.setup_irods:
                user_data.apply(lambda x: self._setup_irods_account(data=x),
                                axis=1)  # create irods account

            file_checksum = calculate_file_checksum(filepath=project_info_file)
            file_size = os.path.getsize(project_info_file)
            file_data=[{'file_path':project_info_file,\
                        'location':'ORWELL',\
                        'md5':file_checksum,\
                        'size':file_size,\
                      }]
            fa = FileAdaptor(**{'session':
                                base.session})  # connect to file adaptor
            fa.store_file_data(data=file_data, autosave=False)

        except:
            if db_connected:
                base.rollback_session()  # rollback session
            raise
        else:
            if db_connected:
                base.commit_session()  # commit changes to db
                if len(user_data.index) > 0 and self.notify_user:
                    user_data.apply(lambda x: self._notify_about_new_user_account(x),\
                                    axis=1)                                               # send mail to new user with their password and forget it
        finally:
            if db_connected:
                base.close_session()  # close db connection

    def _check_and_add_project_attributes(self, data_series):
        '''
    An internal method for checking project data and adding required attributes
    
    :param data_series: A Pandas Series containing project data
    :returns: A Pandas series with project attribute information
    '''
        try:
            if not isinstance(data_series, pd.Series):
                raise AttributeError('Expecting a Pandas Series and got {0}'.\
                                     format(type(data_series)))

            if self.barcode_check_keyword not in data_series or  \
               pd.isnull(data_series[self.barcode_check_keyword]):
                data_series[self.barcode_check_keyword] = 'ON'  # by default barcode checking is always ON
            return data_series
        except:
            raise

    def _read_project_info_and_get_new_entries(self, project_info_file):
        '''
    An internal method for processing project info data
    
    :param project_info_file: A filepath for project_info csv files
    
    :returns: A dictionary with following keys
    
          project_data
          user_data
          project_user_data
          sample_data
    '''
        try:
            if fnmatch.fnmatch(project_info_file, '*.csv'):
                project_info_data = pd.read_csv(
                    project_info_file)  # read project info data from csv file
            elif fnmatch.fnmatch(project_info_file, '*xls'):
                xl = pd.ExcelFile(project_info_file)
                if self.metadata_sheet_name not in xl.sheet_names:  # check for required metadata sheet name
                    raise ValueError('Excel file does not have the sheet {0}'.\
                                     format(self.metadata_sheet_name))
                project_info_data = xl.parse(
                    self.metadata_sheet_name)  # read xls file from the metadata sheet
            else:
                raise ValueError('No parser defined for file {0}'.\
                                 format(project_info_file))

            base = BaseAdaptor(**{'session_class': self.session_class})
            required_project_columns = base.get_table_columns(
                table_name=Project,
                excluded_columns=['project_id'])  # get project columns
            required_project_columns.append(
                self.barcode_check_keyword)  # add barcode check param to project attribute table
            required_user_columns = base.get_table_columns(
                table_name=User,
                excluded_columns=['user_id'])  # get user columns
            required_project_user_columns = [
                'project_igf_id', 'email_id']  # get project user columns
            project_data = project_info_data.loc[:, required_project_columns]  # get data for project table
            user_data = project_info_data.loc[:, required_user_columns]  # get data for user table
            project_user_data = project_info_data.loc[:, required_project_user_columns]  # get data for project user table
            required_sample_columns = list(
                set(project_info_data.columns).difference(
                    set(list(project_data) +
                        list(user_data) +
                        list(project_user_data))))  # all remaining columns go to the sample table
            required_sample_columns.append('project_igf_id')
            sample_data = project_info_data.loc[:, required_sample_columns]  # get data for sample table
            project_data = project_data.drop_duplicates()
            project_data = project_data.apply(
                lambda x: self._check_and_add_project_attributes(x),
                axis=1)  # add missing project attributes to the dataframe
            project_data['project_igf_id'] = \
                project_data['project_igf_id'].map(
                    lambda x: x.replace(' ', ''))  # strip whitespace from project igf id

            user_data = user_data.drop_duplicates()
            user_data['email_id'] = \
                user_data['email_id'].map(
                    lambda x: x.replace(' ', ''))  # strip whitespace from email id
            if 'name' in user_data.columns:
                user_data['name'].fillna('', inplace=True)
                user_data['name'] = \
                    user_data['name'].map(
                        lambda x: x.title())  # reformat name, if present

            project_user_data = project_user_data.drop_duplicates()
            project_user_data['project_igf_id'] = \
                project_user_data['project_igf_id'].map(
                    lambda x: x.replace(' ', ''))  # strip whitespace from project igf id
            project_user_data['email_id'] = \
                project_user_data['email_id'].map(
                    lambda x: x.replace(' ', ''))  # strip whitespace from email id

            sample_data = sample_data.drop_duplicates()  # remove duplicate entries
            sample_data['project_igf_id'] = \
                sample_data['project_igf_id'].map(
                    lambda x: x.replace(' ', ''))  # strip whitespace from project igf id
            sample_data['sample_igf_id'] = \
                sample_data['sample_igf_id'].map(
                    lambda x: x.replace(' ', ''))  # strip whitespace from sample igf id

            if self.project_lookup_column not in project_data.columns:
                raise ValueError('Missing required column: {0}'.\
                                 format(self.project_lookup_column))
            if self.user_lookup_column not in user_data.columns:
                raise ValueError('Missing required column: {0}'.\
                                 format(self.user_lookup_column))
            if self.sample_lookup_column not in sample_data.columns:
                raise ValueError('Missing required column: {0}'.\
                                 format(self.sample_lookup_column))                     # check if required columns are present in the dataframe

            return {'project_data': project_data,
                    'user_data': user_data,
                    'project_user_data': project_user_data,
                    'sample_data': sample_data}
        except:
            raise

    def _find_new_project_info(self):
        '''
    An internal method for fetching new project info files.
    It returns a list of new project info file paths.
    '''
        try:
            new_project_info_list = list()
            fa = FileAdaptor(**{'session_class': self.session_class})
            fa.start_session()  # connect to db
            for root_path, _, files in os.walk(self.projet_info_path,
                                               topdown=True):
                for file_path in files:
                    if fnmatch.fnmatch(file_path, '*.csv') or \
                       fnmatch.fnmatch(file_path, '*xls'):  # only consider csv or xls files
                        full_path = os.path.join(root_path, file_path)
                        file_check = fa.check_file_records_file_path(
                            file_path=full_path)  # check for filepath in db
                        if not file_check:
                            new_project_info_list.append(
                                full_path)  # collect new project info files
            fa.close_session()  # disconnect db
            return new_project_info_list
        except:
            raise
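# A standalone, runnable sketch of the column split performed by
# _read_project_info_and_get_new_entries above: project, user and
# project-user columns are fixed lists, and every remaining column is
# treated as sample metadata. The column values below are illustrative,
# not taken from any real metadata sheet.
import pandas as pd

metadata = pd.DataFrame([{
    'project_igf_id': 'IGFP001 ',
    'name': 'jane doe',
    'email_id': 'jane @example.com',
    'sample_igf_id': 'IGFS001',
    'library_source': 'GENOMIC'}])

project_columns = ['project_igf_id']
user_columns = ['name', 'email_id']
project_user_columns = ['project_igf_id', 'email_id']
sample_columns = list(
    set(metadata.columns).difference(
        set(project_columns + user_columns + project_user_columns)))
sample_columns.append('project_igf_id')  # samples keep their project link

project_data = metadata.loc[:, project_columns].drop_duplicates()
user_data = metadata.loc[:, user_columns].drop_duplicates()
project_user_data = metadata.loc[:, project_user_columns].drop_duplicates()
sample_data = metadata.loc[:, sample_columns].drop_duplicates()

# the same whitespace clean-up the class applies to its id columns
project_data['project_igf_id'] = \
    project_data['project_igf_id'].map(lambda x: x.replace(' ', ''))
user_data['email_id'] = \
    user_data['email_id'].map(lambda x: x.replace(' ', ''))
user_data['name'] = user_data['name'].map(lambda x: x.title())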
import argparse
from igf_data.task_tracking.igf_slack import IGF_slack
from igf_data.process.project_info.project_pooling_info import Project_pooling_info

parser = argparse.ArgumentParser()
parser.add_argument('-d','--dbconfig', required=True, help='Database configuration file path')
parser.add_argument('-n','--slack_config', required=True, help='Slack configuration file path')
parser.add_argument('-o','--output', required=True, help='Gviz json output path')
args = parser.parse_args()

dbconfig = args.dbconfig
slack_config = args.slack_config
output = args.output

if __name__ == '__main__':
  slack_obj = IGF_slack(slack_config=slack_config)  # create slack_obj before the try block so it is bound in except
  try:
    pp = Project_pooling_info(dbconfig_file=dbconfig)
    pp.fetch_db_data_and_prepare_gviz_json(output_file_path=output)
    message = 'Updated project pooling stats'
    slack_obj.\
      post_message_to_channel(
        message=message,
        reaction='pass')
  except Exception as e:
    message = 'Failed to update project pooling stats, error: {0}'.format(e)
    slack_obj.\
      post_message_to_channel(
        message=message,
        reaction='fail')
    raise ValueError(message)
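# A possible invocation of the pooling-stats script above; the script
# filename is a placeholder, while -d, -n and -o are the flags registered
# with argparse:
#
#   python update_project_pooling_info.py \
#     -d /path/to/dbconfig.json \
#     -n /path/to/slack_config.json \
#     -o /path/to/pooling_stats_gviz.json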
import argparse
from datetime import datetime
from igf_data.task_tracking.igf_slack import IGF_slack
from igf_data.task_tracking.igf_asana import IGF_asana
from igf_data.utils.fileutils import get_temp_dir
# NOTE: import path below is assumed from the igf_data package layout
from igf_data.process.seqrun_processing.find_and_process_new_seqrun import (
    find_new_seqrun_dir, check_for_registered_project_and_sample)

parser = argparse.ArgumentParser()
# NOTE: the start of this example was truncated; the add_argument calls for
# seqrun_path, md5_path, dbconfig_path, slack_config, asana_config,
# asana_project_id, pipeline_name and samplesheet_json_schema read below
# were lost, and the '-e' short flag here is an assumption
parser.add_argument('-e',
                    '--exclude_path',
                    action='append',
                    default=[],
                    help='List of sub directories excluded from the search')
args = parser.parse_args()

seqrun_path = args.seqrun_path
md5_path = args.md5_path
dbconfig_path = args.dbconfig_path
slack_config = args.slack_config
asana_config = args.asana_config
asana_project_id = args.asana_project_id
pipeline_name = args.pipeline_name
exclude_path = args.exclude_path
samplesheet_json_schema = args.samplesheet_json_schema

slack_obj = IGF_slack(slack_config=slack_config)
asana_obj = IGF_asana(asana_config=asana_config,
                      asana_project_id=asana_project_id)

if __name__ == '__main__':
    try:
        new_seqruns = find_new_seqrun_dir(seqrun_path, dbconfig_path)
        new_seqruns,message = \
          check_for_registered_project_and_sample(
            seqrun_info=new_seqruns,
            dbconfig=dbconfig_path)
        if message != '':
            msg_tmp_dir = get_temp_dir()  # create temp dir
            time_tuple = datetime.now().timetuple()  # get timetuple for NOW
            time_stamp = \
              '{0}_{1}_{2}-{3}_{4}_{5}'.\
              format(time_tuple.tm_year,
                     time_tuple.tm_mon,
                     time_tuple.tm_mday,
                     time_tuple.tm_hour,
                     time_tuple.tm_min,
                     time_tuple.tm_sec)
            # NOTE: the rest of this example was truncated; the format() call
            # above is reconstructed from the timetuple fields
import argparse
from igf_data.task_tracking.igf_slack import IGF_slack
# NOTE: import path below is assumed from the igf_data package layout
from igf_data.utils.platformutils import load_new_flowcell_data

parser = argparse.ArgumentParser()
# NOTE: the start of this example was truncated; the flowcell_data and update
# arguments are reconstructed from the args.* lookups below, and the short
# flags are assumptions
parser.add_argument('-f',
                    '--flowcell_data',
                    required=True,
                    help='Flowcell rules data json file')
parser.add_argument('-u',
                    '--update',
                    action='store_true',
                    default=False,
                    help='Update existing flowcell rules data, default: False')
parser.add_argument('-d',
                    '--dbconfig_path',
                    required=True,
                    help='Database configuration json file')
parser.add_argument('-s',
                    '--slack_config',
                    required=True,
                    help='Slack configuration json file')
args = parser.parse_args()

dbconfig_path = args.dbconfig_path
slack_config = args.slack_config
flowcell_data = args.flowcell_data
update_data = args.update

slack_obj = IGF_slack(slack_config=slack_config)

if __name__ == '__main__':
    try:
        if update_data:
            raise NotImplementedError(
                'method not available for updating existing data')
        else:
            load_new_flowcell_data(data_file=flowcell_data,
                                   dbconfig=dbconfig_path)
    except Exception as e:
        message = 'Failed to load data to flowcell rules table, error: {0}'.format(
            e)
        slack_obj.post_message_to_channel(message, reaction='fail')
        raise ValueError(message)
import argparse
from igf_data.task_tracking.igf_slack import IGF_slack
# NOTE: import path below is assumed from the igf_data package layout
from igf_data.utils.seqrunutils import load_new_seqrun_data

parser = argparse.ArgumentParser()
parser.add_argument('-p',
                    '--seqrun_data',
                    required=True,
                    help='Seqrun data json file')
parser.add_argument('-d',
                    '--dbconfig_path',
                    required=True,
                    help='Database configuration json file')
parser.add_argument('-s',
                    '--slack_config',
                    required=True,
                    help='Slack configuration json file')
args = parser.parse_args()

dbconfig_path = args.dbconfig_path
slack_config = args.slack_config
seqrun_data = args.seqrun_data

slack_obj = IGF_slack(slack_config=slack_config)

if __name__ == '__main__':
    try:
        load_new_seqrun_data(data_file=seqrun_data, dbconfig=dbconfig_path)
    except Exception as e:
        message = 'Failed to load data to seqrun table, error: {0}'.format(e)
        slack_obj.post_message_to_channel(message, reaction='fail')
        raise ValueError(message)
    else:
        slack_obj.post_message_to_channel(
            message='Loaded new seqrun info to db', reaction='pass')
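# The loader scripts above all repeat one notification pattern: run a task,
# post a 'pass' reaction to Slack on success, and a 'fail' reaction plus the
# error on failure. A minimal sketch of that pattern as a reusable helper;
# the helper itself is not part of the original scripts, only
# post_message_to_channel is the IGF_slack call they use:
def run_with_slack_notification(task, slack_obj, pass_message, fail_message):
    '''Run task() and report the outcome to a Slack channel.'''
    try:
        result = task()
    except Exception as e:
        slack_obj.post_message_to_channel(
            message='{0}, error: {1}'.format(fail_message, e),
            reaction='fail')
        raise
    else:
        slack_obj.post_message_to_channel(
            message=pass_message, reaction='pass')
        return result

# e.g. the seqrun loader above could be written as:
# run_with_slack_notification(
#     task=lambda: load_new_seqrun_data(data_file=seqrun_data,
#                                       dbconfig=dbconfig_path),
#     slack_obj=slack_obj,
#     pass_message='Loaded new seqrun info to db',
#     fail_message='Failed to load data to seqrun table')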
Example #14
class Reset_samplesheet_md5:
    '''
  A class for modifying samplesheet md5 for seqrun data processing
  '''
    def __init__(self,
                 seqrun_path,
                 seqrun_igf_list,
                 dbconfig_file,
                 clean_up=True,
                 json_collection_type='ILLUMINA_BCL_MD5',
                 log_slack=True,
                 log_asana=True,
                 slack_config=None,
                 asana_project_id=None,
                 asana_config=None,
                 samplesheet_name='SampleSheet.csv'):
        '''
    :param seqrun_path: A directory path for sequencing run home
    :param seqrun_igf_list: A file path listing sequencing runs to reset
    :param dbconfig_file: A file containing the database configuration
    :param clean_up: Clean up input file once it's processed, default True
    :param json_collection_type: A collection type for md5 json file lookup, default ILLUMINA_BCL_MD5
    :param log_slack: A boolean flag for toggling Slack messages, default True
    :param log_asana: A boolean flag for toggling Asana messages, default True
    :param slack_config: A file containing Slack tokens, default None
    :param asana_config: A file containing Asana tokens, default None
    :param asana_project_id: A numeric Asana project id, default None
    :param samplesheet_name: Name of the samplesheet file, default SampleSheet.csv
    '''
        try:
            self.seqrun_path = seqrun_path
            self.seqrun_igf_list = seqrun_igf_list
            self.json_collection_type = json_collection_type
            self.log_slack = log_slack
            self.log_asana = log_asana
            self.clean_up = clean_up
            self.samplesheet_name = samplesheet_name
            dbparams = read_dbconf_json(dbconfig_file)
            self.base_adaptor = BaseAdaptor(**dbparams)
            if log_slack and slack_config is None:
                raise ValueError('Missing slack config file')
            elif log_slack and slack_config:
                self.igf_slack = IGF_slack(slack_config)  # add slack object

            if log_asana and \
               (asana_config is None or \
                asana_project_id is None):
                raise ValueError(
                    'Missing asana config file or asana project id')
            elif log_asana and asana_config and asana_project_id:
                self.igf_asana = IGF_asana(
                    asana_config, asana_project_id)  # add asana object
        except:
            raise

    def _get_samplesheet_md5(self, seqrun_igf_id):
        '''
    An internal method for calculating the md5 value of an updated samplesheet file

    :param seqrun_igf_id: A string of seqrun_igf_id
    :returns: MD5 value of the samplesheet file as a string
    '''
        try:
            samplesheet_path = os.path.join(self.seqrun_path, seqrun_igf_id,
                                            self.samplesheet_name)
            if not os.path.exists(samplesheet_path):
                raise IOError('Samplesheet not found for seqrun {0}'.\
                              format(seqrun_igf_id))
            return calculate_file_checksum(filepath=samplesheet_path,
                                           hasher='md5')
        except:
            raise

    @staticmethod
    def _get_updated_json_file(json_file_path,
                               samplesheet_md5,
                               samplesheet_name,
                               file_field='seqrun_file_name',
                               md5_field='file_md5'):
        '''
    A static method for checking the samplesheet md5 value in a json file and
    creating a new copy of the json file with an updated md5 if the samplesheet
    has changed. A standalone sketch of this logic is shown after this class.

    :param json_file_path: A file path for seqrun md5 json file
    :param samplesheet_md5: A md5 value for samplesheet file
    :param samplesheet_name: Name of the samplesheet file
    :param file_field: A keyword for filename lookup in json file, default seqrun_file_name
    :param md5_field: A keyword for md5 value lookup in json file, default file_md5
    :returns: A string filepath if the samplesheet has been updated, or None
    '''
        try:
            if not os.path.exists(json_file_path):
                raise IOError(
                    'Json md5 file {0} not found'.format(json_file_path))

            create_new_file = False  # don't create new json by default
            json_data = list()
            with open(json_file_path, 'r') as jp:
                json_data = json.load(jp)  # load data from json file

            for json_row in json_data:
                if json_row[file_field] == samplesheet_name and \
                   json_row[md5_field] != samplesheet_md5:
                    json_row[md5_field] = samplesheet_md5  # update json data with new md5
                    create_new_file = True  # create new json if md5 values don't match
                    break  # stop file lookup

            if create_new_file:
                temp_dir = get_temp_dir()
                json_file_name = os.path.basename(
                    json_file_path)  # get original json filename
                temp_json_file = os.path.join(
                    temp_dir, json_file_name)  # get temp file path
                with open(temp_json_file, 'w') as jwp:
                    json.dump(json_data, jwp,
                              indent=4)  # write data to temp file
                return temp_json_file  # return file path
            else:
                return None  # return none
        except:
            raise

    def run(self):
        '''
    A method for resetting md5 values in the samplesheet json files for all seqrun ids
    '''
        try:
            db_connected = False
            seqrun_list = self._read_seqrun_list(
                self.seqrun_igf_list)  # fetch list of seqrun ids from input file
            if len(seqrun_list) > 0:
                base = self.base_adaptor
                base.start_session()  # connect to database
                db_connected = True
                ca = CollectionAdaptor(**{'session': base.session})  # connect to collection table
                fa = FileAdaptor(**{'session': base.session})  # connect to file table
                for seqrun_id in seqrun_list:
                    try:
                        files_data = ca.get_collection_files(
                            collection_name=seqrun_id,
                            collection_type=self.json_collection_type,
                            output_mode='one_or_none'
                        )  # check for existing md5 json file in db
                        # TO DO: skip seqrun_id if pipeline is still running
                        if files_data is not None:
                            json_file_path = [
                                element.file_path for element in files_data
                                if isinstance(element, File)
                            ][0]  # get md5 json file path from sqlalchemy collection results
                            samplesheet_md5 = self._get_samplesheet_md5(
                                seqrun_id
                            )  # get md5 value for new samplesheet file
                            new_json_path = self._get_updated_json_file(
                                json_file_path, samplesheet_md5,
                                self.samplesheet_name
                            )  # get updated md5 json file if samplesheet has been changed
                            if new_json_path is not None:
                                new_json_file_md5 = calculate_file_checksum(
                                    filepath=new_json_path, hasher='md5')
                                fa.update_file_table_for_file_path(
                                    file_path=json_file_path,
                                    tag='md5',
                                    value=new_json_file_md5,
                                    autosave=False
                                )  # update json file md5 in db, don't commit yet
                                move_file(source_path=new_json_path,
                                          destinationa_path=json_file_path,
                                          force=True)  # overwrite json file
                                base.commit_session()  # save changes in db
                                message='Setting new Samplesheet info for run {0}'.\
                                        format(seqrun_id)
                                if self.log_slack:
                                    self.igf_slack.post_message_to_channel(
                                        message,
                                        reaction='pass')  # send log to slack
                                if self.log_asana:
                                    self.igf_asana.comment_asana_task(
                                        task_name=seqrun_id,
                                        comment=message)  # send log to asana
                            else:
                                message = 'no change in samplesheet for seqrun {0}'.format(
                                    seqrun_id)
                                warnings.warn(message)
                                if self.log_slack:
                                    self.igf_slack.post_message_to_channel(
                                        message, reaction='pass')
                        else:
                            message='No md5 json file found for seqrun_igf_id: {0}'.\
                                    format(seqrun_id)
                            warnings.warn(message)  # no exception raised if the seqrun id is not found
                            if self.log_slack:
                                self.igf_slack.post_message_to_channel(
                                    message, reaction='fail')
                    except Exception as e:
                        base.rollback_session()
                        message='Failed to update json file for seqrun id {0}, error: {1}'.\
                                format(seqrun_id,e)
                        warnings.warn(message)
                        if self.log_slack:
                            self.igf_slack.post_message_to_channel(
                                message, reaction='fail')
                base.close_session()  # close db connection
                if self.clean_up:
                    self._clear_seqrun_list(
                        self.seqrun_igf_list)  # clear input file
            else:
                message = 'No new seqrun id found for changing samplesheet md5'
                warnings.warn(message)
                if self.log_slack:
                    self.igf_slack.post_message_to_channel(
                        message, reaction='sleep')
        except:
            if db_connected:
                base.rollback_session()
                base.close_session()
            raise

    @staticmethod
    def _clear_seqrun_list(seqrun_igf_list):
        '''
    A static method for clearing the seqrun list file

    :param seqrun_igf_list: A file containing the sequencing run ids
    '''
        try:
            if not os.path.exists(seqrun_igf_list):
                raise IOError('File {0} not found'.format(seqrun_igf_list))

            with open(seqrun_igf_list, 'w') as fwp:
                fwp.write('')  # overwrite the seqrun list file with an empty string
        except:
            raise

    @staticmethod
    def _read_seqrun_list(seqrun_igf_list):
        '''
    A static method for reading a list of sequencing run ids from an input
    file into a list

    :param seqrun_igf_list: A file containing the sequencing run ids
    :returns: A list of seqrun ids from the input file
    '''
        try:
            if not os.path.exists(seqrun_igf_list):
                raise IOError('File {0} not found'.format(seqrun_igf_list))

            seqrun_ids = list()  # define an empty list of seqrun ids
            with open(seqrun_igf_list, 'r') as fp:
                seqrun_ids = [i.strip()
                              for i in fp]  # add seqrun ids to the list
            return seqrun_ids
        except:
            raise
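# A self-contained sketch of the md5-reset logic implemented by
# Reset_samplesheet_md5._get_updated_json_file above, using only the
# standard library; the directory, file names and the stale md5 value are
# illustrative:
import hashlib
import json
import os
import tempfile

workdir = tempfile.mkdtemp()
samplesheet = os.path.join(workdir, 'SampleSheet.csv')
with open(samplesheet, 'w') as fp:
    fp.write('[Header]\nIEMFileVersion,4\n')  # toy samplesheet content

with open(samplesheet, 'rb') as fp:
    samplesheet_md5 = hashlib.md5(fp.read()).hexdigest()  # fresh md5 value

md5_json = os.path.join(workdir, 'seqrun_md5.json')
with open(md5_json, 'w') as fp:
    json.dump([{'seqrun_file_name': 'SampleSheet.csv',
                'file_md5': 'stale-md5-value'}], fp)  # md5 json with a stale entry

with open(md5_json, 'r') as fp:
    json_data = json.load(fp)

create_new_file = False  # rewrite the json only if the samplesheet md5 is stale
for json_row in json_data:
    if json_row['seqrun_file_name'] == 'SampleSheet.csv' and \
       json_row['file_md5'] != samplesheet_md5:
        json_row['file_md5'] = samplesheet_md5
        create_new_file = True
        break

if create_new_file:
    with open(md5_json, 'w') as fp:  # the class writes a temp copy; this sketch writes in place
        json.dump(json_data, fp, indent=4)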
class Experiment_metadata_updator:
  '''
  A class for updating metadata for the experiment table in the database
  '''
  def __init__(self,dbconfig_file,log_slack=True,slack_config=None):
    '''
    :param dbconfig_file: A database configuration file path
    :param log_slack: A boolean flag for toggling Slack messages, default True
    :param slack_config: A file containing Slack tokens, default None
    '''
    try:
      dbparams = read_dbconf_json(dbconfig_file)
      self.base_adaptor=BaseAdaptor(**dbparams)
      self.log_slack=log_slack
      if log_slack and slack_config is None:
        raise ValueError('Missing slack config file')
      elif log_slack and slack_config:
        self.igf_slack = IGF_slack(slack_config)                                # add slack object
    except:
      raise

  @staticmethod
  def _text_sum(a=None):
    if isinstance(a,list):
      return ';'.join(a)
    else:
      return a


  def update_metadta_from_sample_attribute(self,experiment_igf_id=None,
                                           sample_attribute_names=('library_source',
                                                                   'library_strategy',
                                                                   'experiment_type')):
    '''
    A method for fetching experiment metadata from sample_attribute tables
    :param experiment_igf_id: An experiment igf id for updating only a selected experiment, default None for all experiments
    :param sample_attribute_names: A list of sample attribute names to look for experiment metadata,
                                   default: library_source, library_strategy, experiment_type
    '''
    try:
      sample_attribute_names = list(sample_attribute_names)
      db_connected=False
      base=self.base_adaptor
      base.start_session()
      db_connected=True
      query=base.session.\
            query(Experiment.experiment_igf_id).\
            distinct(Experiment.experiment_id).\
            join(Sample).\
            join(Sample_attribute).\
            filter(Sample.sample_id==Experiment.sample_id).\
            filter(Sample.sample_id==Sample_attribute.sample_id).\
            filter(Experiment.library_source=='UNKNOWN').\
            filter(Experiment.library_strategy=='UNKNOWN').\
            filter(Experiment.experiment_type=='UNKNOWN').\
            filter(Sample_attribute.attribute_value.notin_(['UNKNOWN'])).\
            filter(Sample_attribute.attribute_name.in_(sample_attribute_names)) # base query for db lookup
      if experiment_igf_id is not None:
        query=query.filter(Experiment.experiment_igf_id==experiment_igf_id)     # look for specific experiment_igf_id

      exp_update_count=0
      exps=base.fetch_records(query, output_mode='object')                      # fetch exp records as generator expression
      for row in exps:
        experiment_id=row[0]
        ea=ExperimentAdaptor(**{'session':base.session})
        attributes=ea.fetch_sample_attribute_records_for_experiment_igf_id(
                      experiment_igf_id=experiment_id,
                      output_mode='object',
                      attribute_list=sample_attribute_names)
        exp_update_data=dict()
        for attribute_row in attributes:
          exp_update_data.update({attribute_row.attribute_name:attribute_row.attribute_value})

        if len(exp_update_data.keys())>0:
          exp_update_count+=1
          ea.update_experiment_records_by_igf_id(experiment_igf_id=experiment_id,
                                                 update_data=exp_update_data,
                                                 autosave=False)                # update experiment entry if attribute records are found

      base.commit_session()
      base.close_session()
      db_connected=False
      if self.log_slack:
        message='Updated {0} experiments from sample attribute records'.\
                format(exp_update_count)
        self.igf_slack.post_message_to_channel(message=message,
                                               reaction='pass')
    except Exception as e:
      if db_connected:
        base.rollback_session()
        base.close_session()
      message='Error while updating experiment records: {0}'.format(e)
      warnings.warn(message)
      if self.log_slack:
        self.igf_slack.post_message_to_channel(message=message,
                                               reaction='fail')
      raise
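# A possible driver for the class above; the dbconfig path and the experiment
# id are placeholders, and the method name keeps the spelling used by the
# class:
emu = Experiment_metadata_updator(
        dbconfig_file='/path/to/dbconfig.json',
        log_slack=False)                           # skip Slack for a local run
emu.update_metadta_from_sample_attribute(
        experiment_igf_id='IGF0001_EXP')           # or None to update all experiments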
Example #16
# NOTE: the start of this example was truncated; the imports and the argparse
# setup for the args.* lookups below were lost
asana_config = args.asana_config
asana_project_id = args.asana_project_id
pipeline_name = args.pipeline_name
fastq_type = args.fastq_type
project_name_file = args.project_name_file
species_name = args.species_name
library_source = args.library_source
reset_project_list = args.reset_project_list

if __name__=='__main__':
  try:
    if not os.path.exists(project_name_file):
      raise IOError('File {0} not found'.\
                    format(project_name_file))

    slack_obj = IGF_slack(slack_config=slack_config)                            # get slack instance
    asana_obj = IGF_asana(asana_config=asana_config,
                          asana_project_id=asana_project_id)                    # get asana object
    available_projects,seeded_projects = \
      find_new_analysis_seeds(
        dbconfig_path=dbconfig_path,
        pipeline_name=pipeline_name,
        project_name_file=project_name_file,
        species_name_list=species_name,
        fastq_type=fastq_type,
        library_source_list=library_source)
    if available_projects is not None:
      message = 'New projects available for seeding: {0}'.\
                format(available_projects)
      slack_obj.\
        post_message_to_channel(