Example 1
def update_status_db(data, server_type=None):
    """ Pushed the data to status db.

    data can be from nases
    server_type should be 'nas'.
    """
    db_config = CONFIG.get('statusdb')
    if db_config is None:
        logging.error('"statusdb" must be present in the config file!')
        raise RuntimeError('"statusdb" must be present in the config file!')
    try:
        couch_connection = statusdb.StatusdbSession(db_config).connection
    except Exception as e:
        logging.error(str(e))
        raise

    db = couch_connection['server_status']
    logging.info('Connection established')
    for key, server in data.items():  # data is a dict of dicts; each value is a command's output
        server['name'] = key  # key is the NAS URL
        # a raw datetime (e.g. datetime.datetime(2015, 11, 18, 9, 54, 33, 473189))
        # is not JSON serializable, so store an ISO-formatted string instead
        server['time'] = datetime.datetime.now().isoformat()
        server['server_type'] = server_type or 'unknown'

        try:
            db.save(server)
        except Exception as e:
            logging.error(str(e))
            raise
        else:
            logging.info('{}: Server status has been updated'.format(key))
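A minimal usage sketch, assuming CONFIG has already been loaded; the host names and metric fields below are invented for illustration:

# Hypothetical payload: one entry per NAS, each holding parsed command output.
disk_usage = {
    'nas21.example.org': {'disk_space_used': '67%', 'uptime': '12 days'},
    'nas22.example.org': {'disk_space_used': '81%', 'uptime': '3 days'},
}
update_status_db(disk_usage, server_type='nas')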
Example 2
def run_is_demuxed(run, couch_info=None):
    """Check in StatusDB 'x_flowcells' database if the given run has an entry which means it was
    demultiplexed (as TACA only creates a document upon successfull demultiplexing)

    :param str run: run name
    :param dict couch_info: a dict with 'statusDB' info
    """
    if not couch_info:
        raise SystemExit(
            'Checking for demultiplexing is enabled in the config file, but no "statusDB" info was given'
        )
    run_terms = run.split('_')
    run_date = run_terms[0]
    run_fc = run_terms[-1]
    run_name = '{}_{}'.format(run_date, run_fc)
    try:
        couch_connection = statusdb.StatusdbSession(couch_info).connection
        fc_db = couch_connection[couch_info['db']]
        for fc in fc_db.view('names/name', reduce=False, descending=True):
            if fc.key != run_name:
                continue
            fc_doc = fc_db.get(fc.id)
            if not fc_doc or not fc_doc.get('illumina', {}).get('Demultiplex_Stats', {}):
                return False
            return True
    except Exception:
        raise
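A hedged usage sketch. The 'db' key is read by the function itself; the remaining keys follow the usual CouchDB connection fields, but the exact names expected by StatusdbSession are an assumption here:

couch_info = {
    'url': 'status-dev.example.org',  # hypothetical host, assumed key name
    'username': 'user',               # assumed key name
    'password': 'pass',               # assumed key name
    'port': 5984,                     # assumed key name
    'db': 'x_flowcells',              # the function looks up couch_info['db']
}
if run_is_demuxed('211025_A00187_0505_AHVNLYDRXX', couch_info=couch_info):
    print('Run has a Demultiplex_Stats entry in statusdb')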
Example 3
def _upload_to_statusdb(run):
    """Triggers the upload to statusdb using the dependency flowcell_parser.

    :param Run run: the object run
    """
    couch_conf = CONFIG['statusdb']
    couch_connection = statusdb.StatusdbSession(couch_conf).connection
    db = couch_connection[couch_conf['xten_db']]
    parser = run.runParserObj
    # Check for NoIndex lanes
    for element in parser.obj['samplesheet_csv']:
        # 'NoIndex' in the case of HiSeq, empty in the case of HiSeqX
        if 'NoIndex' in element['index'] or not element['index']:
            lane = element['Lane']  # this is a lane with NoIndex
            # In this case PF Cluster is the number of undetermined reads
            try:
                PFclusters = parser.obj['Undetermined'][lane]['unknown']
            except KeyError:
                logger.error('While handling NoIndex lane {}, not all expected '
                             'values were available'.format(lane))
                continue
            # In Lanes_stats fix the lane yield
            lanes_stats = parser.obj['illumina']['Demultiplex_Stats']['Lanes_stats']
            lanes_stats[int(lane) - 1]['PF Clusters'] = str(PFclusters)
            # Now fix Barcode lane stats
            updated = 0  # Check that only one update is made
            for sample in parser.obj['illumina']['Demultiplex_Stats']['Barcode_lane_statistics']:
                if lane in sample['Lane']:
                    updated += 1
                    sample['PF Clusters'] = str(PFclusters)
            if updated != 1:
                logger.error(
                    'While handling NoIndex lane {}, expected to update exactly '
                    'one barcode_lane entry but updated {}. Cannot safely '
                    'continue, so failing.'.format(lane, updated))
                os.sys.exit(1)
            # Reaching this point means the HTML representation was changed to
            # accommodate the weird things we do; flag such documents with a
            # placeholder
            parser.obj['illumina']['Demultiplex_Stats']['NotOriginal'] = 'True'
    # Update info about bcl2fastq tool
    if not parser.obj.get('DemultiplexConfig'):
        parser.obj['DemultiplexConfig'] = {
            'Setup': {
                'Software': run.CONFIG.get('bcl2fastq', {})
            }
        }
    statusdb.update_doc(db, parser.obj, over_write_db_entry=True)
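For reference, a toy parser.obj illustrating the structure the NoIndex branch above walks; the field names come from the code, while the values are invented:

parser_obj = {
    'samplesheet_csv': [{'Lane': '1', 'index': 'NoIndex'}],
    # the lane's undetermined reads become its 'PF Clusters' value
    'Undetermined': {'1': {'unknown': 123456}},
    'illumina': {'Demultiplex_Stats': {
        'Lanes_stats': [{'PF Clusters': '0'}],
        'Barcode_lane_statistics': [{'Lane': '1', 'PF Clusters': '0'}],
    }},
}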
Example 4
def _log_pdc_statusdb(self, run):
    """Log the time stamp in statusDB if a file is successfully sent to PDC."""
    try:
        run_vals = run.split('_')
        run_fc = '{}_{}'.format(run_vals[0], run_vals[-1])
        couch_connection = statusdb.StatusdbSession(self.couch_info).connection
        db = couch_connection[self.couch_info['db']]
        fc_names = {e.key: e.id for e in db.view('names/name', reduce=False)}
        d_id = fc_names[run_fc]
        doc = db.get(d_id)
        doc['pdc_archived'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        db.save(doc)
        logger.info('Logged "pdc_archived" timestamp for fc {} in statusdb doc "{}"'.format(run, d_id))
    except Exception:
        logger.warning('Not able to log "pdc_archived" timestamp for run {}'.format(run))
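The run-to-flowcell name reduction used above, shown standalone with a made-up run name; the resulting 'date_flowcell' string is what the 'names/name' view keys look like:

run = '211025_A00187_0505_AHVNLYDRXX'  # hypothetical run name
run_vals = run.split('_')
run_fc = '{}_{}'.format(run_vals[0], run_vals[-1])  # '211025_AHVNLYDRXX'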
Example 5
def fail_run(runid, project):
    """Updates status of specified run or project-run to Failed."""
    statusdb_conf = CONFIG.get('statusdb')
    logger.info('Connecting to status db: {}:{}'.format(
        statusdb_conf.get('url'), statusdb_conf.get('port')))
    try:
        status_db = statusdb.StatusdbSession(statusdb_conf).connection
    except Exception as e:
        logger.error(
            'Cannot connect to status_db: http://{}:*****@{}:{}'.format(
                statusdb_conf.get('username'), statusdb_conf.get('url'),
                statusdb_conf.get('port')))
        logger.error(e)
        raise
    bioinfo_db = status_db['bioinfo_analysis']
    if project is not None:
        view = bioinfo_db.view('full_doc/pj_run_to_doc')
        rows = view[[project, runid]].rows
        logger.info(
            'Updating status of {} objects with flowcell_id: {} and project_id: {}'
            .format(len(rows), runid, project))
    else:
        view = bioinfo_db.view('full_doc/run_id_to_doc')
        rows = view[[runid]].rows
        logger.info(
            'Updating status of {} objects with flowcell_id: {}'.format(
                len(rows), runid))

    new_timestamp = datetime.datetime.now().isoformat()
    updated = 0
    for row in rows:
        if row.value['status'] != 'Failed':
            row.value['values'][new_timestamp] = {
                'sample_status': 'Failed',
                'user': '******'
            }
            row.value['status'] = 'Failed'
        try:
            bioinfo_db.save(row.value)
            updated += 1
        except Exception as e:
            logger.error(
                'Cannot update object project-sample-run-lane: {}-{}-{}-{}'.format(
                    row.value.get('project_id'), row.value.get('sample'),
                    row.value.get('run_id'), row.value.get('lane')))
            logger.error(e)
            raise
    logger.info('Successfully updated {} objects'.format(updated))
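Two hypothetical invocations (both identifiers invented): passing a project fails only that project's documents for the run, while passing None fails every document for the run:

fail_run('211025_A00187_0505_AHVNLYDRXX', 'P12345')  # one project on the run
fail_run('211025_A00187_0505_AHVNLYDRXX', None)      # the whole run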
Example 6
def update_cronjob_db():
    server = platform.node().split('.')[0]
    timestamp = datetime.datetime.now()
    # parse results
    result = _parse_crontab()
    # connect to db
    statusdb_conf = CONFIG.get('statusdb')
    logging.info('Connecting to database: {}'.format(
        CONFIG.get('statusdb', {}).get('url')))
    try:
        couch_connection = statusdb.StatusdbSession(statusdb_conf).connection
    except Exception as e:
        logging.error(str(e))
    else:
        # update document
        crontab_db = couch_connection['cronjobs']
        view = crontab_db.view('server/alias')
        # default to an empty doc so the final check below is safe
        doc = {}
        # create the doc if it does not exist
        if not view[server].rows:
            logging.info('Creating a document')
            doc = {
                'users': dict(result),
                'Last updated': str(timestamp),
                'server': server,
            }
        # else: get existing doc
        for row in view[server]:
            logging.info('Updating the document')
            doc = crontab_db.get(row.value)
            doc['users'].update(result)
            doc['Last updated'] = str(timestamp)
        if doc:
            try:
                crontab_db.save(doc)
            except Exception as e:
                logging.error(str(e))
            else:
                logging.info('{} has been successfully updated'.format(server))
        else:
            logging.warning('Document has not been created/updated')
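For reference, a cronjobs document as created by the first branch might look like this (the server, user name, and crontab line are invented):

doc = {
    'users': {'funk_004': ['0 2 * * * /usr/local/bin/backup.sh']},
    'Last updated': '2021-10-25 02:00:00.123456',
    'server': 'preproc1',
}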
Example 7
def create(projects, ngi_config_file, fastq_1, fastq_2):
    statusdb_conf = CONFIG.get('statusdb')
    if statusdb_conf is None:
        logger.error('No statusdb field in taca configuration file')
        return 1
    if 'dev' not in statusdb_conf['url']:
        logger.error('url for status db is {}, but a dev instance must be used in this case'.format(statusdb_conf['url']))
    couch_connection = statusdb.StatusdbSession(statusdb_conf).connection
    projectsDB = couch_connection['projects']
    project_summary = projectsDB.view('project/summary')
    projects_closed_more_than_three_months = {}
    projects_closed_more_than_one_month_less_than_three = {}
    projects_closed_less_than_one_month = {}
    projects_opened = {}
    current_date = datetime.datetime.today()
    date_limit_one_year = current_date - relativedelta(months=6)  # yes yes, I know... but this way I am sure all data is in the x_flowcell db
    date_limit_one_month = current_date - relativedelta(months=1)
    date_limit_three_month = current_date - relativedelta(months=3)
    for row in project_summary:
        project_id = row['key'][1]
        project_status = row['key'][0]
        if 'application' not in row['value']:
            continue
        if row['value']['no_samples'] > 50:
            continue # Skip large projects
        application = row['value']['application']
        if project_status == 'closed':
            if 'close_date' in row['value']:
                close_date = datetime.datetime.strptime(row['value']['close_date'], '%Y-%m-%d')
                if close_date > date_limit_one_year: # If the project has been closed after the date limit
                    if close_date >= date_limit_one_month:
                        projects_closed_less_than_one_month[project_id] = {'project_name': row['value']['project_name'],
                                                                           'application': application,
                                                                           'no_samples': row['value']['no_samples']}
                    elif close_date < date_limit_one_month and close_date >= date_limit_three_month:
                        projects_closed_more_than_one_month_less_than_three[project_id] = {'project_name': row['value']['project_name'],
                                                                                           'application': application,
                                                                                           'no_samples': row['value']['no_samples']}
                    elif close_date < date_limit_three_month:
                        projects_closed_more_than_three_months[project_id] = {'project_name': row['value']['project_name'],
                                                                              'application': application,
                                                                              'no_samples': row['value']['no_samples']}
        elif project_status == 'open':
            if 'lanes_sequenced' in row['value'] and row['value']['lanes_sequenced'] > 0:
                projects_opened[project_id] =  {'project_name': row['value']['project_name'],
                                                'application': application,
                                                'no_samples': row['value']['no_samples']}
        else:
            print('status {}'.format(project_status))
    ## Now I can parse the x_flowcell db to check what I can and cannot use
    whole_genome_projects = int(2 * projects / 3)
    projects_to_reproduce = []
    # integer division so each call receives a whole number of projects
    select_random_projects(projects_closed_more_than_three_months,
                           whole_genome_projects // 4 + 1,
                           'WG re-seq',
                           projects_to_reproduce,
                           'WGreseq_tot_closed')
    select_random_projects(projects_closed_more_than_one_month_less_than_three,
                           whole_genome_projects // 4 + 1,
                           'WG re-seq',
                           projects_to_reproduce,
                           'WGreseq_closed_clean_no_del')
    select_random_projects(projects_closed_less_than_one_month,
                           whole_genome_projects // 4 + 1,
                           'WG re-seq',
                           projects_to_reproduce,
                           'WGreseq_closed_no_clean')
    select_random_projects(projects_opened,
                           whole_genome_projects // 4 + 1,
                           'WG re-seq',
                           projects_to_reproduce,
                           'WGreseq_open')

    other_projects = int(projects / 3)
    select_random_projects(projects_closed_more_than_three_months,
                           other_projects // 4 + 1,
                           'other',
                           projects_to_reproduce,
                           'noWGreseq_tot_closed')
    select_random_projects(projects_closed_more_than_one_month_less_than_three,
                           other_projects // 4 + 1,
                           'other',
                           projects_to_reproduce,
                           'noWGreseq_closed_clean_no_del')
    select_random_projects(projects_closed_less_than_one_month,
                           other_projects // 4 + 1,
                           'other',
                           projects_to_reproduce,
                           'noWGreseq_closed_no_clean')
    select_random_projects(projects_opened,
                           other_projects // 4 + 1,
                           'other',
                           projects_to_reproduce,
                           'noWGreseq_open')

    # Create the ngi_pipeline environment
    print('#NGI_CONFIG variable is {}. This variable needs to be in the .bashrc file'.format(ngi_config_file))
    print('NGI_CONFIG={}'.format(ngi_config_file))
    try:
        ngi_config = conf.load_config(ngi_config_file)
    except IOError as e:
        print('ERROR: {}'.format(str(e)))
        return 1  # ngi_config is needed below, so do not continue without it
    # Create uppmax env
    paths = create_uppmax_env(ngi_config)

    print('#Going to reproduce {} projects (if this number is different from the one you specified... trust me, do not worry)'.format(len(projects_to_reproduce)))
    # Scan over x_flowcell and reproduce FCs
    flowcellDB = couch_connection['x_flowcells']
    reproduced_projects = {}
    for fc_doc in flowcellDB:
        try:
            samplesheet_csv = flowcellDB[fc_doc]['samplesheet_csv']
        except KeyError:
            continue # Parse only FC that have a samplesheet
        # Check if this FC contains one of the projects I need to replicate.
        if 'SampleName' in samplesheet_csv[0]:
            projects_in_FC = set(line['SampleName'].split('_')[0] for line in samplesheet_csv)
        else:
            projects_in_FC = set(line['Sample_Name'].split('_')[0] for line in samplesheet_csv)
        found = False
        for project_pair in projects_to_reproduce:
            project = project_pair[0]
            if project in projects_in_FC:
                # This FC needs to be created
                if not found:
                    # Create the FC only the first time I see a project belonging to it
                    create_FC(paths['flowcell_inbox'], flowcellDB[fc_doc]['RunInfo']['Id'], samplesheet_csv, fastq_1, fastq_2)
                    found = True
                # But I keep track of all projects-run I need to organise
                if project not in reproduced_projects:
                    reproduced_projects[project] = []
                reproduced_projects[project].append(flowcellDB[fc_doc]['RunInfo']['Id'])
    print('#Reproduced {} projects (if the numbers differ do not worry, most likely we selected projects without runs)'.format(len(reproduced_projects)))
    for project in projects_to_reproduce:
        if project[0] in reproduced_projects:
            print('#  {}: {}'.format(project[0], project[1]))
    # Need to output the command to organise
    to_be_deleted = []
    for project in reproduced_projects:
        for FC in reproduced_projects[project]:
            print('Running: ngi_pipeline_start.py organize flowcell {} -p {}'.format(FC, project))
            with open('ngi_pipeline_local.logs', 'a') as NGILOGS:  # append so logs from earlier FCs are kept
                return_value = subprocess.call(['ngi_pipeline_start.py',
                                                'organize',
                                                'flowcell',
                                                '{}'.format(FC),
                                                '-p',
                                                '{}'.format(project)],
                                               stdout=NGILOGS, stderr=NGILOGS)
            if return_value > 0:
                print('#project {} not organised: have a look at the logs, but most likely this project is not in charon'.format(project))
                if project not in to_be_deleted:
                    to_be_deleted.append(project)

    for project in to_be_deleted:
        del reproduced_projects[project]

    # Create ANALYSIS --
    for project in projects_to_reproduce:
        if project[0] in reproduced_projects: # Only for projects that I know I have organised
            produce_analysis_qc_ngi(ngi_config, project[0])
            if project[1].startswith('WGreseq'):
                produce_analysis_piper(ngi_config, project[0])

    # Store in a file the results
    with open('projects.txt', 'w') as PROJECTS:
        for project in projects_to_reproduce:
            if project[0] in reproduced_projects:
                PROJECTS.write(u'{}:{}\n'.format(project[0], project[1]))
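A hypothetical invocation with invented paths. Because the function splits the requested total as two thirds whole-genome re-seq and one third other, a multiple of 3 keeps the split exact:

create(9, '/home/user/.ngi_config/ngi_config.yaml',
       '/path/to/template_R1.fastq.gz', '/path/to/template_R2.fastq.gz')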
Example 8
def update_statusdb(run_dir):
    """Gets status for a project."""
    # Fetch individual fields
    project_info = get_ss_projects(run_dir)
    run_id = os.path.basename(os.path.abspath(run_dir))
    statusdb_conf = CONFIG.get('statusdb')
    couch_connection = statusdb.StatusdbSession(statusdb_conf).connection
    valueskey = datetime.datetime.now().isoformat()
    db = couch_connection['bioinfo_analysis']
    view = db.view('latest_data/sample_id')
    # Construct and send individual records; if the samplesheet is
    # incorrectly formatted the loop is skipped
    if project_info:
        for flowcell in project_info:
            for lane in project_info[flowcell]:
                for sample in project_info[flowcell][lane]:
                    for project in project_info[flowcell][lane][sample]:
                        project_info[flowcell][lane][sample].value = get_status(run_dir)
                        sample_status = project_info[flowcell][lane][sample].value
                        obj = {
                            'run_id': run_id,
                            'project_id': project,
                            'flowcell': flowcell,
                            'lane': lane,
                            'sample': sample,
                            'status': sample_status,
                            'values': {
                                valueskey: {
                                    'user': '******',
                                    'sample_status': sample_status
                                }
                            }
                        }
                        # If an entry exists, append to it.
                        # Special case to handle lanes written as ints; can be
                        # safely removed once old lanes are no longer stored as ints
                        if len(view[[project, run_id, int(lane), sample]].rows) >= 1:
                            lane = int(lane)
                        if len(view[[project, run_id, lane, sample]].rows) >= 1:
                            remote_id = view[[project, run_id, lane, sample]].rows[0].id
                            lane = str(lane)
                            remote_doc = db[remote_id]['values']
                            remote_status = db[remote_id]['status']
                            # Only updates the listed statuses
                            if remote_status in [
                                    'New', 'ERROR', 'Sequencing',
                                    'Demultiplexing'
                            ] and sample_status != remote_status:
                                # Appends old entry to new. Essentially merges the two
                                for k, v in remote_doc.items():
                                    obj['values'][k] = v
                                logger.info(
                                    'Updating {} {} {} {} {} as {}'.format(
                                        run_id, project, flowcell, lane,
                                        sample, sample_status))
                                # Sort timestamps, newest first
                                obj['values'] = OrderedDict(
                                    sorted(obj['values'].items(),
                                           key=lambda k_v: k_v[0],
                                           reverse=True))
                                # Update the existing record in place
                                obj['_rev'] = db[remote_id].rev
                                obj['_id'] = remote_id
                                db.save(obj)
                        # Creates new entry
                        else:
                            logger.info('Creating {} {} {} {} {} as {}'.format(
                                run_id, project, flowcell, lane, sample,
                                sample_status))
                            # Creates record
                            db.save(obj)
                        # Set the FC error flag
                        if project_info[flowcell].value is not None:
                            if (('Failed' in project_info[flowcell].value
                                 and 'Failed' not in sample_status)
                                    or ('Failed' in sample_status and 'Failed'
                                        not in project_info[flowcell].value)):
                                project_info[flowcell].value = 'Ambiguous'
                            else:
                                project_info[flowcell].value = sample_status
            # Check whether the flowcell needs partial re-doing and
            # email an error per flowcell
            if project_info[flowcell].value is not None:
                if 'Ambiguous' in project_info[flowcell].value:
                    error_emailer('failed_run', run_id)
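A minimal usage sketch, assuming CONFIG is loaded and the directory name is a valid run id (the path is invented):

update_statusdb('/srv/illumina/211025_A00187_0505_AHVNLYDRXX')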