Example #1
def check_tar_status_and_delete(db, record, days=60, dryrun=False):
    """Check run.tar status
    """
    run_num = record['run']
    if record['deletion'].get('tar') == "locked":
        LOGGER.info("%s tar ball creation in progress", run_num)
        return None
    relative_days = relative_isoformat_time(
        record['deletion'].get('timestamp_tar'))
    if relative_days > days:
        if record['deletion'].get('status') == "locked":
            LOGGER.info("Deletion of %s tar ball in progress", run_num)
            return None
        if dryrun:
            LOGGER.info("Skipping Deletion of %s due to dryrun option",
                        run_num)
            return
        #set deletion.status = locked, update deletion.timestamp
        res = db.update_one({"run": run_num}, {
            "$set": {
                "deletion.status": "locked",
                "deletion.timestamp": generate_timestamp()
            }
        })
        assert res.modified_count == 1, (
            "Modified {} documents instead of 1".format(res.modified_count))
        #delete tar ball
        tar_file = record['deletion'].get('tar')
        assert os.path.exists(
            tar_file), "The tar file {} does not exist".format(tar_file)
        try:
            os.remove(tar_file)
        except OSError as e:
            LOGGER.critical("Error: %s - %s.", e.filename, e.strerror)
        #unset deletion.tar and deletion.timestamp_tar
        res = db.update_one(
            {"run": run_num},
            {"$unset": {
                "deletion.tar": "",
                "deletion.timestamp_tar": ""
            }})
        assert res.modified_count == 1, (
            "Modified {} documents instead of 1".format(res.modified_count))
        #set deletion.status = deleted, update deletion.timestamp
        res = db.update_one({"run": run_num}, {
            "$set": {
                "deletion.status": "deleted",
                "deletion.timestamp": generate_timestamp()
            }
        })
        assert res.modified_count == 1, (
            "Modified {} documents instead of 1".format(res.modified_count))
        LOGGER.info("Deleted the tar ball for %s ", run_num)
        return True
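
For orientation, here is a minimal sketch of the MongoDB document shape that check_tar_status_and_delete (and create_run_tar in Example #3) appears to operate on. The field names are taken from the snippets; the values are invented for illustration.

# Hypothetical record -- field names from the snippets above, values invented.
record = {
    "run": "NG001_FLOWCELLXYZ",
    "deletion": {
        "tar": "/path/to/NG001_FLOWCELLXYZ.tar",    # or "locked" while the tar is being created
        "timestamp_tar": "2017-01-01T12:00:00.000000",
        "status": "deleted",                        # or "locked" while deletion is in progress
        "timestamp": "2017-03-01T12:00:00.000000",
    },
}
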
Example #2
    def put_file_into_database(self, name, file):
        # Replace any existing file with the same name, then store the new one
        if self.__fs.exists({"filename": name}):
            file_to_delete_id = self.__fs.find_one({"filename": name})._id
            self.__fs.delete(file_to_delete_id)
        self.__fs.put(file,
                      filename=name,
                      uploadDateCET=generate_timestamp())
Example #3
def create_run_tar(db, run_num):
    """compress bcl directory into a tar ball
    """
    #Set deletion.tar update timestamp
    res = db.update_one({"run": run_num}, {
        "$set": {
            "deletion.tar": "locked",
            "deletion.timestamp_tar": generate_timestamp()
        }
    })
    assert res.modified_count == 1, (
        "Modified {} documents instead of 1".format(res.modified_count))
    #Create tar ball and md5sum
    rundir = get_bcl_runfolder_for_runid(run_num)
    assert os.path.isdir(
        rundir), "The run directory {} does not exist".format(rundir)
    run_tar = "/mnt/projects/userrig/BENCHMARK_testing/test/" + run_num + ".tar"
    LOGGER.info("compression started %s ", run_tar)
    with tarfile.open(run_tar, "x") as tar:
        tar.add(rundir)
    md5sum_cmd = 'md5sum %s' % (run_tar)
    dest_md5sum = "/mnt/projects/userrig/BENCHMARK_testing/test/" + run_num + ".md5sum"
    assert os.path.exists(run_tar), "Tar ball {} does not exist".format(
        run_tar)
    try:
        with open(dest_md5sum, "w") as f:
            subprocess.check_call(md5sum_cmd,
                                  shell=True,
                                  stderr=subprocess.STDOUT,
                                  stdout=f)
        LOGGER.info("compression completed %s ", run_num)
        #Delete bcl directory ## FIXME finally
        #shutil.rmtree(rundir)
    except (subprocess.CalledProcessError, OSError) as e:
        LOGGER.fatal("The following command failed: %s", md5sum_cmd)
        LOGGER.fatal("Error: %s", e)
        LOGGER.fatal("Exiting")
        sys.exit(1)
    LOGGER.info("Deletion of bcl directory completed for %s ", run_num)
    #set deletion.tar = filename, update deletion.timestamp
    res = db.update_one({"run": run_num}, {
        "$set": {
            "deletion.tar": run_tar,
            "deletion.timestamp_tar": generate_timestamp()
        }
    })
    assert res.modified_count == 1, (
        "Modified {} documents instead of 1".format(res.modified_count))
Example #4
    def __init__(self, project_name, username, config_path):
        """
        The constructor initializes the config file, the project
        and the user name.
        :param project_name: Project name to be assigned
        :param username: Username to be attached to this project.
        This will later on be used for security purposes
        :param config_path: Path of the config
        """
        config_parser = ConfigHandler(config_path, project_name=project_name)
        self.compute_config = config_parser.get_compute_config()
        self.storage_config = config_parser.get_storage_config()
        self.queue_config = config_parser.get_queue_config()
        self.master_node_config = config_parser.get_master_node_config()
        self.compute_ports = config_parser.get_ports()

        self.project_name = project_name
        self.username = username
        self.experiment_id = generate_timestamp("experiment")
        logger.info(f"Experiment ID: {self.experiment_id}\n")

        self.experiment_dir = self.initialize_folders()
        self._initialize_bucket_structure()
        self.initialize_queue()

        self._create_completion_submission_docker_compose()

        self.completion_service_process = \
            self.initialize_completion_service()

        self.compute_managers = {}
        self.create_instances()
Example #5
    def from_consumer_and_token(
        oauth_consumer,
        token=None,
        callback=None,
        verifier=None,
        http_method=HTTP_METHOD,
        http_url=None,
        parameters=None,
    ):
        if not parameters:
            parameters = {}

        defaults = {
            "oauth_consumer_key": oauth_consumer.key,
            "oauth_timestamp": generate_timestamp(),
            "oauth_nonce": generate_nonce(),
            "oauth_version": OAuthRequest.version,
        }

        defaults.update(parameters)
        parameters = defaults

        if token:
            parameters["oauth_token"] = token.key
            if token.callback:
                parameters["oauth_callback"] = token.callback
            # 1.0a support for verifier.
            if verifier:
                parameters["oauth_verifier"] = verifier
        elif callback:
            # 1.0a support for callback in the request token request.
            parameters["oauth_callback"] = callback

        return OAuthRequest(http_method, http_url, parameters)
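
A hedged usage sketch follows. Only from_consumer_and_token appears above; the consumer/token classes and the import path mirror the classic python-oauth API and are assumptions, not part of the snippet.

# Hypothetical usage -- OAuthConsumer/OAuthToken and the import path are assumed.
from oauth import OAuthConsumer, OAuthRequest, OAuthToken

consumer = OAuthConsumer(key='consumer-key', secret='consumer-secret')
token = OAuthToken(key='token-key', secret='token-secret')
request = OAuthRequest.from_consumer_and_token(
    consumer,
    token=token,
    http_method='GET',
    http_url='https://api.example.com/resource',
)
# The oauth_timestamp and oauth_nonce parameters are filled in automatically
# via generate_timestamp() and generate_nonce().
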
Example #6
    async def send_speech_config_msg(self):
        # assemble the payload for the speech.config message
        context = {
            'system': {
                'version': '5.4'
            },
            'os': {
                'platform': platform.system(),
                'name': platform.system() + ' ' + platform.version(),
                'version': platform.version()
            },
            'device': {
                'manufacturer': 'SpeechSample',
                'model': 'SpeechSample',
                'version': '1.0.00000'
            }
        }
        payload = {'context': context}

        # assemble the header for the speech.config message
        msg = 'Path: speech.config\r\n'
        msg += 'Content-Type: application/json; charset=utf-8\r\n'
        msg += 'X-Timestamp: ' + utils.generate_timestamp() + '\r\n'
        # append the body of the message
        msg += '\r\n' + json.dumps(payload, indent=2)

        # DEBUG PRINT
        # print('>>', msg)

        await self.ws.send(msg)
Example #7
    async def send_audio_msg(self, audio_file_path):
        # open the binary audio file
        with open(audio_file_path, 'rb') as f_audio:
            num_chunks = 0
            while True:
                # read the audio file in small consecutive chunks
                audio_chunk = f_audio.read(self.chunk_size)
                if not audio_chunk:
                    break
                num_chunks += 1

                # assemble the header for the binary audio message
                msg = b'Path: audio\r\n'
                msg += b'Content-Type: audio/x-wav\r\n'
                msg += b'X-RequestId: ' + bytearray(self.request_id,
                                                    'ascii') + b'\r\n'
                msg += b'X-Timestamp: ' + bytearray(utils.generate_timestamp(),
                                                    'ascii') + b'\r\n'
                # prepend the length of the header in 2-byte big-endian format
                msg = len(msg).to_bytes(2, byteorder='big') + msg
                # append the body of the message
                msg += b'\r\n' + audio_chunk

                # DEBUG PRINT
                # print('>>', msg)
                # sys.stdout.flush()

                try:
                    await self.ws.send(msg)
                    # DEBUG CONCURRENCY
                    # await asyncio.sleep(0.1)
                except websockets.exceptions.ConnectionClosed as e:
                    print('Connection closed: {0}'.format(e))
                    return
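
The binary audio message assembled above is framed as a 2-byte big-endian header length, followed by the ASCII headers, a blank line, and the raw audio bytes. A small parser for that framing (illustrative only, not part of the original client) might look like this:

# Illustrative only: unpacks a binary message built the way send_audio_msg builds it.
def parse_binary_speech_msg(msg):
    header_len = int.from_bytes(msg[:2], byteorder='big')
    header_block = msg[2:2 + header_len].decode('ascii')
    headers = dict(line.split(': ', 1)
                   for line in header_block.splitlines() if line)
    body = msg[2 + header_len + 2:]  # skip the b'\r\n' separating headers from the audio chunk
    return headers, body
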
Example #8
 def recordTelemetry(self, response_path):
     if response_path not in [
             next(iter(msg.keys())) for msg in self.received_messages
     ]:
         self.received_messages.append(
             {response_path: utils.generate_timestamp()})
     else:
         for i, msg in enumerate(self.received_messages):
             if next(iter(msg.keys())) == response_path:
                 if not isinstance(msg[response_path], list):
                     self.received_messages[i][response_path] = [
                         msg[response_path]
                     ]
                 self.received_messages[i][response_path].append(
                     utils.generate_timestamp())
                 break
Example #9
def purge(db, runid_and_flowcellid, mail_to):
    """
    purging bcl data from /mnt/seq/novogene
    """
    rundir = get_bcl_runfolder_for_runid(runid_and_flowcellid)
    if not os.path.exists(rundir):
        LOGGER.critical("Run directory '%s' does not exist.\n", rundir)
        return
    # Sanity checks for Sequencing run
    assert os.path.exists(os.path.join(rundir, 'RunInfo.xml')), \
        "No RunInfo.xml found under {}".format(rundir)
    stat_info = os.stat(rundir)
    #Check if uid is novogene (925)
    assert stat_info.st_uid == 925, "The run {} does not belong to Novogene user".format(
        rundir)
    try:
        start_time = generate_timestamp()
        res = db.update_one({"run": runid_and_flowcellid}, {
            "$set": {
                "raw-delete": {
                    "start_time": start_time,
                    "Status": "STARTED",
                }
            }
        })
        assert res.modified_count == 1, (
            "Modified {} documents instead of 1".format(res.modified_count))
        #FIXME for production release
        #shutil.rmtree(rundir)
        end_time = generate_timestamp()
        res = db.update_one({"run": runid_and_flowcellid}, {
            "$set": {
                "raw-delete.Status": "SUCCESS",
                "raw-delete.end_time": end_time
            }
        })
        assert res.modified_count == 1, (
            "Modified {} documents instead of 1".format(res.modified_count))
        subject = "bcl deletion: {}".format(runid_and_flowcellid)
        body = "Bcl deletion completed successfully from {}".format(rundir)
        send_mail(subject, body, toaddr=mail_to)
    except OSError:
        LOGGER.critical("Error happened while deleting '%s'", rundir)
        res = db.update_one({"run": runid_and_flowcellid},
                            {"$unset": {"raw-delete": ""}})
        assert res.modified_count == 1, (
            "Modified {} documents instead of 1".format(res.modified_count))
        subject = "Error: bcl deletion {}".format(runid_and_flowcellid)
        body = "Error happened while deleting raw data under {}".format(rundir)
        send_mail(subject, body, toaddr=mail_to)
Example #10
 def __record_telemetry(self, response_path):
     # if a single message of a certain type, store the value directly
     if response_path not in [
             next(iter(msg.keys())) for msg in self.received_messages
     ]:
         self.received_messages.append(
             {response_path: utils.generate_timestamp()})
     # if multiple messages of a certain type, store the values in a list
     else:
         for i, msg in enumerate(self.received_messages):
             if next(iter(msg.keys())) == response_path:
                 if not isinstance(msg[response_path], list):
                     self.received_messages[i][response_path] = [
                         msg[response_path]
                     ]
                 self.received_messages[i][response_path].append(
                     utils.generate_timestamp())
                 break
Example #11
def relative_isoformat_time(last_analysis):
    """
    Relative isoformat_time
    """
    analysis_epoch_time = isoformat_to_epoch_time(last_analysis+"+08:00")
    epoch_time_now = isoformat_to_epoch_time(generate_timestamp()+"+08:00")
    rd = relative_epoch_time(epoch_time_now, analysis_epoch_time)
    relative_days = rd.months*30 + rd.days
    return relative_days
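
Every snippet on this page calls a project-local generate_timestamp() helper whose definition is not shown (and whose signature varies: Example #4 passes a prefix argument). Judging from the way its result is concatenated with "+08:00" and passed to isoformat_to_epoch_time here and in Example #23, it plausibly returns an ISO-8601 string for the current local time. A minimal sketch under those assumptions, with the companion helpers also guessed:

# Hypothetical sketches only -- the real helpers live in the projects quoted above.
from datetime import datetime

import dateutil.parser
from dateutil.relativedelta import relativedelta


def generate_timestamp():
    """Current local time as an ISO-8601 string (assumed behaviour)."""
    return datetime.now().isoformat()


def isoformat_to_epoch_time(iso_str):
    """ISO-8601 string (with UTC offset) to seconds since the epoch (assumed behaviour)."""
    return dateutil.parser.parse(iso_str).timestamp()


def relative_epoch_time(epoch_now, epoch_then):
    """Difference between two epoch times as a dateutil relativedelta (assumed behaviour)."""
    return relativedelta(datetime.fromtimestamp(epoch_now),
                         datetime.fromtimestamp(epoch_then))
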
Example #12
    async def connect_to_speech_api(self, language, response_format,
                                    recognition_mode):
        self.language = language
        self.response_format = response_format
        self.recognition_mode = recognition_mode

        # determine the endpoint based on the selected recognition mode
        endpoint = self.__get_cur_endpoint()
        if endpoint is None:
            print('Error: invalid recognition mode.')
            return

        # assemble the URL and the headers for the connection request
        url = endpoint + '?language={0}&format={1}'.format(
            self.language, self.response_format)
        headers = {
            'Authorization': 'Bearer ' + self.auth_token,
            'X-ConnectionId': self.connection_id
        }

        # record the Connection metric telemetry data
        self.metrics.append({
            'Name': 'Connection',
            'Id': self.connection_id,
            'Start': utils.generate_timestamp()
        })

        try:
            # request a WebSocket connection to the speech API
            print(endpoint)
            print(headers)
            self.ws = await websockets.client.connect(url,
                                                      extra_headers=headers)
        except websockets.exceptions.InvalidHandshake as err:
            print('Handshake error: {0}'.format(err))
            return
        # TODO: add Connection failure telemetry for error cases

        # record the Connection metric telemetry data
        self.metrics[-1]['End'] = utils.generate_timestamp()

        # send the speech.config message
        await self.send_speech_config_msg()
Example #13
    async def connectAPI(self):
        endpoint = endpoints_ws[self.recognition_mode]
        url = endpoint
        headers = {
            'Authorization': 'Bearer ' + self.auth_token,
            'X-ConnectionId': self.connection_id
        }

        self.metrics.append({
            'Name': 'Connection',
            'Id': self.connection_id,
            'Start': utils.generate_timestamp()
        })

        try:
            self.ws = await websockets.client.connect(url,
                                                      extra_headers=headers)
        except websockets.exceptions.InvalidHandshake as err:
            print('Handshake error: {0}'.format(err))
            return

        self.metrics[-1]['End'] = utils.generate_timestamp()
        await self.sendSpeechConfig()
Example #14
    def write(self, dirname, dbid, timestamp=None):
        """Write starter flag file
        """

        if not timestamp:
            timestamp = generate_timestamp()
        self.timestamp = timestamp
        self.dbid = dbid
        self.filename = os.path.join(
            dirname, self.pattern.format(timestamp=self.timestamp))

        assert not os.path.exists(self.filename), (
            "StartFlag {} already exists".format(self.filename))
        with open(self.filename, 'w') as fh:
            fh.write(dbid)
Example #15
def get_downstream_outdir(requestor, pipeline_name, pipeline_version=None):
    """generate downstream output directory
    """

    if is_devel_version():
        basedir = site_cfg['downstream_outdir_base']['devel']
    else:
        basedir = site_cfg['downstream_outdir_base']['production']
    if pipeline_version:
        pversion = pipeline_version
    else:
        pversion = get_pipeline_version(nospace=True)
    outdir = DOWNSTREAM_OUTDIR_TEMPLATE.format(
        basedir=basedir, user=requestor, pipelineversion=pversion,
        pipelinename=pipeline_name, timestamp=generate_timestamp())
    return outdir
Example #16
    async def sendTelemetry(self, is_first_turn=False):
        payload = {'ReceivedMessages': self.received_messages}
        if is_first_turn:
            payload['Metrics'] = self.metrics

        msg = 'Path: telemetry\r\n'
        msg += 'Content-Type: application/json; charset=utf-8\r\n'
        msg += 'X-RequestId: ' + self.request_id + '\r\n'
        msg += 'X-Timestamp: ' + utils.generate_timestamp() + '\r\n'
        msg += '\r\n' + json.dumps(payload, indent=2)

        try:
            await self.ws.send(msg)
        except websockets.exceptions.ConnectionClosed as e:
            print('Connection closed: {0}'.format(e))
            return
Example #17
def bundle_and_clean_logs(pipeline_outdir,
                          result_outdir="out/",
                          log_dir="logs/",
                          overwrite=False):
    """bundle log files in pipeline_outdir+result_outdir and
    pipeline_outdir+log_dir to pipeline_outdir+logs.tar.gz and remove

    See http://stackoverflow.com/questions/40602894/access-to-log-files for potential alternatives
    """

    for d in [
            pipeline_outdir,
            os.path.join(pipeline_outdir, result_outdir),
            os.path.join(pipeline_outdir, log_dir)
    ]:
        if not os.path.exists(d):
            logger.warning("Missing directory %s. Skipping log bundling.", d)
            return

    bundle = os.path.join(log_dir,
                          "logs.tar.gz")  # relative to pipeline_outdir
    if not overwrite and os.path.exists(os.path.join(pipeline_outdir, bundle)):
        bundle = os.path.join(log_dir,
                              "logs.{}.tar.gz".format(generate_timestamp()))
        assert not os.path.exists(os.path.join(pipeline_outdir, bundle))

    orig_dir = os.getcwd()
    os.chdir(pipeline_outdir)
    # all log files associated with output files
    logfiles = glob.glob(os.path.join(result_outdir, "**/*.log"),
                         recursive=True)
    # (cluster) log directory
    logfiles.extend(glob.glob(os.path.join(log_dir, "*")))
    # paranoid cleaning and some exclusion
    logfiles = [
        f for f in logfiles
        if os.path.isfile(f) and not f.endswith("snakemake.log")
    ]

    with tarfile.open(bundle, "w:gz") as tarfh:
        for f in logfiles:
            tarfh.add(f)
            os.unlink(f)

    os.chdir(orig_dir)
Example #18
    def __init__(
            self,
            script_name,  # used as logging prefix. can be dummy
            pipeline_name,
            pipeline_version,
            submitter,
            site,
            instance_id,
            log_path,  # main logging file
            elm_units):
        """FIXME:add-doc"""

        assert isinstance(elm_units, list)

        elmlogdir = os.getenv('RPD_ELMLOGDIR')
        assert elmlogdir, ("RPD_ELMLOGDIR undefined")

        pipelogdir = os.path.join(elmlogdir, pipeline_name)
        assert os.path.exists(pipelogdir), (
            "pipeline log dir {} doesn't exist".format(pipelogdir))

        # timestamp just a way to make it unique
        logfile = os.path.join(pipelogdir, generate_timestamp() + ".log")
        assert not os.path.exists(logfile)
        self.logfile = logfile

        # only used as logging prefix (not even parsed by ELM)
        self.script_name = script_name

        # json-like values
        #self.fields = OrderedDict()
        self.fields = dict()
        # caller provided
        self.fields['pipeline_name'] = pipeline_name
        self.fields['pipeline_version'] = pipeline_version
        self.fields['site'] = site
        self.fields['instance_id'] = instance_id
        self.fields['submitter'] = submitter
        self.fields['log_path'] = log_path
        # internally computed
        self.fields['status_id'] = None

        self.elm_units = elm_units
Example #19
def get_bcl2fastq_outdir(runid_and_flowcellid):
    """where to write bcl2fastq output to
    """

    if is_devel_version():
        basedir = site_cfg['bcl2fastq_outdir_base']['devel']
    else:
        basedir = site_cfg['bcl2fastq_outdir_base']['production']

    machineid, runid, flowcellid = get_machine_run_flowcell_id(
        runid_and_flowcellid)

    outdir = "{basedir}/{mid}/{rid}_{fid}/bcl2fastq_{ts}".format(
        basedir=basedir,
        mid=machineid,
        rid=runid,
        fid=flowcellid,
        ts=generate_timestamp())
    return outdir
Example #20
def update_run_status(mongo_status_script, run_num, outdir, status, testing):
    """Update run status in the mongoDB
    """
    logger.info("Setting analysis for %s to %s", run_num, status)
    analysis_id = generate_timestamp()
    mongo_update_cmd = [mongo_status_script, "-r", run_num, "-s", status]
    mongo_update_cmd.extend(["-a", analysis_id, "-o", outdir])
    if testing:
        mongo_update_cmd.append("-t")
    try:
        _ = subprocess.check_output(mongo_update_cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logger.fatal("The following command failed with return code %s: %s",
                     e.returncode, ' '.join(mongo_update_cmd))
        logger.fatal("Output: %s", e.output.decode())
        logger.fatal("Exiting")
        sys.exit(1)

    flagfile = os.path.join(outdir, "SEQRUNFAILED")
    logger.info("Creating flag file %s", flagfile)
    with open(flagfile, 'w') as _:
        pass
Example #21
    async def sendAudio(self, audio_file_path):
        with open(audio_file_path, 'rb') as f_audio:
            num_chunks = 0
            while True:
                audio_chunk = f_audio.read(self.chunk_size)
                if not audio_chunk:
                    break
                num_chunks += 1

                msg = b'Path: audio\r\n'
                msg += b'Content-Type: audio/x-wav\r\n'
                msg += b'X-RequestId: ' + bytearray(self.request_id,
                                                    'ascii') + b'\r\n'
                msg += b'X-Timestamp: ' + bytearray(utils.generate_timestamp(),
                                                    'ascii') + b'\r\n'
                msg = len(msg).to_bytes(2, byteorder='big') + msg
                msg += b'\r\n' + audio_chunk

                try:
                    await self.ws.send(msg)
                except websockets.exceptions.ConnectionClosed as e:
                    print('Connection closed: {0}'.format(e))
                    return
Example #22
    async def send_telemetry_msg(self, is_first_turn=False):
        # assemble the payload for the telemetry message
        payload = {'ReceivedMessages': self.received_messages}
        if is_first_turn:
            payload['Metrics'] = self.metrics

        # assemble the header for the telemetry message
        msg = 'Path: telemetry\r\n'
        msg += 'Content-Type: application/json; charset=utf-8\r\n'
        msg += 'X-RequestId: ' + self.request_id + '\r\n'
        msg += 'X-Timestamp: ' + utils.generate_timestamp() + '\r\n'
        # append the body of the message
        msg += '\r\n' + json.dumps(payload, indent=2)

        # DEBUG PRINT
        # print('>>', msg)
        # sys.stdout.flush()

        try:
            await self.ws.send(msg)
        except websockets.exceptions.ConnectionClosed as e:
            print('Connection closed: {0}'.format(e))
            return
Example #23
def runs_from_db(db, days=75, win=34):
    """Get the runs from pipeline_run collections"""
    epoch_present, epoch_back = generate_window(win)
    results = db.find({
        "run": {
            "$regex": "^NG00"
        },
        "raw-delete": {
            "$exists": False
        },
        "timestamp": {
            "$gt": epoch_back,
            "$lt": epoch_present
        }
    })
    LOGGER.info("Found %d runs for last %s days", results.count(), win)
    for record in results:
        LOGGER.debug("record: %s", record)
        if not record.get('run'):
            LOGGER.critical("run is missing for DB-id %s", record['_id'])
            continue
        runid_and_flowcellid = record['run']
        if 'analysis' not in record:
            continue
        last_analysis = record['analysis'][-1]
        status = last_analysis.get("Status")
        end_time = last_analysis.get("end_time")
        if not status or not end_time:
            continue
        analysis_epoch_time = isoformat_to_epoch_time(end_time + "+08:00")
        epoch_time_now = isoformat_to_epoch_time(generate_timestamp() +
                                                 "+08:00")
        rd = relative_epoch_time(epoch_time_now, analysis_epoch_time)
        relative_days = rd.months * 30 + rd.days
        if status == 'SUCCESS' and relative_days > days:
            yield runid_and_flowcellid
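
The date arithmetic at the end of runs_from_db repeats Example #11 almost verbatim; a hedged refactor that reuses that helper (assuming relative_isoformat_time is importable in this module) could be:

# Hedged refactor only -- assumes relative_isoformat_time (Example #11) is importable here.
def analysis_is_stale(status, end_time, days):
    """True when the last analysis succeeded more than `days` days ago."""
    if not status or not end_time:
        return False
    return status == 'SUCCESS' and relative_isoformat_time(end_time) > days
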
Example #24
    async def sendSpeechConfig(self):
        context = {
            'system': {
                'version': '5.4'
            },
            'os': {
                'platform': platform.system(),
                'name': platform.system() + ' ' + platform.version(),
                'version': platform.version()
            },
            'device': {
                'manufacturer': 'SpeechSample',
                'model': 'SpeechSample',
                'version': '1.0.00000'
            }
        }
        payload = {'context': context}

        msg = 'Path: speech.config\r\n'
        msg += 'Content-Type: application/json; charset=utf-8\r\n'
        msg += 'X-Timestamp: ' + utils.generate_timestamp() + '\r\n'
        msg += '\r\n' + json.dumps(payload, indent=2)

        await self.ws.send(msg)
Example #25
parser.add_argument("-m",
                    "--model",
                    help="path to word2vec/model/timestamp.model")
parser.add_argument("-p",
                    "--project",
                    help="path to validation_set.tsv and submissions")
args = parser.parse_args()

# Load model
model = LsiModel.load(args.model, mmap='r')

# Load validation set and advance 1 line
validation_set = open("%s/validation_set.tsv" % args.project)
validation_set.readline()

output = open("%s/%s_submission.csv" % (args.project, generate_timestamp()),
              "w")
output.write("id,correctAnswer\n")

for line in validation_set:
    elements = line.split("\t")
    question_id = elements.pop(0)

    # Get bag-of-words representation of question and answers
    doc_vectors = [
        model.id2word.doc2bow(element.split()) for element in elements
    ]
    question = doc_vectors.pop(0)

    # Generate list of tuples:
    # (Cosine similarity, mapped index 0-3 to A-D)
Example #26
def mark_as_completed():
    """Dropping a flag file marking analysis as complete"""
    analysis_dir = os.getcwd()
    flag_file = os.path.join(analysis_dir, WORKFLOW_COMPLETION_FLAGFILE)
    with open(flag_file, 'a') as fh:
        fh.write("{}\n".format(generate_timestamp()))
Example #27
    def run_inference_engine(self,
                             model_name: str,
                             model_dir: str,
                             to_csv: bool = False,
                             output_dir: str = None,
                             load_from_s3: bool = False,
                             creds: Dict = None) -> pd.DataFrame:
        """Conducts inference using the test set.

        Arguments:
            model_name {str} -- Name of the trained model.
            model_dir {str} -- Path to where the model is stored.

        Keyword Arguments:
            to_csv {bool} -- Save to csv file (default: {False})
            output_dir {str} -- Path to output directory (default: {None})
            load_from_s3 {bool} -- Load trained model from s3 bucket (default: {False})
            creds {Dict} -- Dictionary containing AWS credentials. Requires
            aws_access_key_id, aws_secret_access_key, bucket. (default: {None})
                E.g.
                CREDENTIALS = {}
                CREDENTIALS['aws_access_key_id'] = os.environ.get("aws_access_key_id")
                CREDENTIALS['aws_secret_access_key'] = os.environ.get("aws_secret_access_key")
                CREDENTIALS['bucket'] = os.environ.get("bucket")

        Returns:
            submission_df {pd.DataFrame} -- A predictions dataframe ready for submission
            to the public leaderboard.
        """
        def _conduct_inference() -> defaultdict:
            predictions = defaultdict(list)
            testing_loaders = self._get_all_testing_loaders()
            for loader in testing_loaders:
                for batch, data in enumerate(loader):
                    image = self.trainer._load_to_gpu_float(data["image"])
                    grapheme, vowel, consonant = self.trainer.model(image)
                    for idx, img_id in enumerate(data["image_id"]):
                        predictions["grapheme"].append(
                            grapheme[idx].cpu().detach().numpy())
                        predictions["vowel"].append(
                            vowel[idx].cpu().detach().numpy())
                        predictions["consonant"].append(
                            consonant[idx].cpu().detach().numpy())
                        predictions["image_id"].append(img_id)

            return predictions

        def _get_maximum_probs(preds: defaultdict) -> Dict:
            return {
                "final_grapheme":
                np.argmax(np.mean(preds["grapheme"], axis=0), axis=1),
                "final_vowel":
                np.argmax(np.mean(preds["vowel"], axis=0), axis=1),
                "final_consonant":
                np.argmax(np.mean(preds["consonant"], axis=0), axis=1),
                "image_ids":
                preds["image_id"]
            }

        def _create_submission_df(pred_dict: Dict) -> pd.DataFrame:
            predictions = []
            for idx, image_id in enumerate(pred_dict["image_ids"]):
                predictions.append((f"{image_id}_grapheme_root",
                                    pred_dict["final_grapheme"][idx]))
                predictions.append((f"{image_id}_vowel_diacritic",
                                    pred_dict["final_vowel"][idx]))
                predictions.append((f"{image_id}_consonant_diacritic",
                                    pred_dict["final_consonant"][idx]))

            return pd.DataFrame(predictions, columns=["row_id", "target"])

        final_predictions = defaultdict(list)
        for idx in range(1, self.params["test_loops"]):
            LOGGER.info(f'Conducting inference for fold {idx}')
            model_name_path = f'{model_name}_bengali_fold{idx}.pth'
            model_state_path = f'{model_dir}/{model_name_path}'
            if load_from_s3:
                self.trainer.load_model_from_s3(filename=model_state_path,
                                                key=model_name_path,
                                                creds=creds)
            self.trainer.load_model_locally(model_path=model_state_path)
            self.trainer.model.to(self.trainer.device)
            self.trainer.model.eval()
            predictions = _conduct_inference()
            final_predictions["grapheme"].append(predictions["grapheme"])
            final_predictions["vowel"].append(predictions["vowel"])
            final_predictions["consonant"].append(predictions["consonant"])
            if idx == 1:
                final_predictions["image_id"].extend(predictions["image_id"])

        pred_dictionary = _get_maximum_probs(preds=final_predictions)
        submission_df = _create_submission_df(pred_dict=pred_dictionary)
        if to_csv:
            timestamp = utils.generate_timestamp()
            output_path = f"{output_dir}/submission_{timestamp}"
            LOGGER.info(f'Saving submission dataframe to {output_path}')
            submission_df.to_csv(output_path, index=False)

        return submission_df
Example #28
"""
  Use LSA to extract latent vectors
"""
import argparse
import bz2
import logging

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LsiModel

from utils import generate_timestamp

logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO
)
timestamp = generate_timestamp()

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dictionary", help="path to wiki_en_wordids.txt")
parser.add_argument("-c", "--corpus", help="path to wiki_en_tfidf.mm")
parser.add_argument("-m", "--model", help="path to model output")
args = parser.parse_args()

# load id->word mapping (the dictionary)
id2word = Dictionary.load_from_text(bz2.BZ2File(args.dictionary))

# load corpus iterator
mm = MmCorpus(args.corpus)

print(mm)
# MmCorpus(3933461 documents, 100000 features, 612118814 non-zero entries)
Example #29
# same as folder name. also used for cluster job names
PIPELINE_NAME = "Mapping"
#CONFIG
CONFIG = "/home/userrig/Solexa/bcl2fastq2-v2.17/"
CONFIG += "generateBCL2FASTQ2.17config.sh"
#SAMPLESHEET
SAMPLESHEET = "/home/userrig/Solexa/bcl2fastq2-v2.17/"
SAMPLESHEET += "generateBCL2FASTQ2.17SampleSheet.sh"
#BWA mapping pipeline
BWA = "/home/userrig/pipelines/NewBwaMappingPipelineMem/"
BWA += "generateBwa0.7.5aconfigurationV217V2.sh"
#RNA mapping pipeline
RNA = "/home/userrig/pipelines/NewRNAseqTophatCufflinksPipeline/"
RNA += "generateTophatCufflinksconfigurationV217V2.sh"
#ANALYSIS_ID
analysis_id = generate_timestamp()
# global logger
logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter('[{asctime}] {levelname:8s} {filename} {message}',
                      style='{'))
logger.addHandler(handler)


def main():
    """main function"""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-1',
                        "--break-after-first",
                        action='store_true',
Example #30
def main():
    """main function"""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '-r',
        "--runid",
        help="Run ID plus flowcell ID",
        required=True,
    )
    parser.add_argument(
        '-s',
        "--status",
        help="Analysis status",
        required=True,
        choices=['STARTED', 'SUCCESS', 'FAILED', 'SEQRUNFAILED', 'NON-BCL'])
    parser.add_argument('-a',
                        "--analysis-id",
                        help="Analysis id",
                        required=True)
    parser.add_argument('-o', "--out", help="Analysis output directory")
    parser.add_argument('-t', "--test-server", action='store_true')
    parser.add_argument('-n', "--dry-run", action='store_true', help="Dry run")
    parser.add_argument('-v',
                        '--verbose',
                        action='count',
                        default=0,
                        help="Increase verbosity")
    parser.add_argument('-q',
                        '--quiet',
                        action='count',
                        default=0,
                        help="Decrease verbosity")
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if not is_production_user():
        logger.warning("Not a production user. Skipping MongoDB update")
        sys.exit(1)
    user_name = "userrig"

    run_number = args.runid
    connection = mongodb_conn(args.test_server)
    if connection is None:
        sys.exit(1)
    logger.info("Database connection established")
    db = connection.gisds.runcomplete
    logger.debug("DB %s", db)
    logger.info("Status for %s is %s", run_number, args.status)
    if args.status in ["STARTED", "SEQRUNFAILED"]:
        try:
            if not args.dry_run:
                res = db.update_one({"run": run_number}, {
                    "$push": {
                        "analysis": {
                            "analysis_id": args.analysis_id,
                            "user_name": user_name,
                            "out_dir": args.out,
                            "Status": args.status,
                        }
                    }
                })
                assert res.modified_count == 1, (
                    "Modified {} documents instead of 1".format(
                        res.modified_count))
        except (pymongo.errors.OperationFailure, AssertionError) as e:
            logger.fatal(
                "MongoDB update failure while setting run %s analysis_id %s to %s",
                run_number, args.analysis_id, args.status)
            sys.exit(1)

    elif args.status in ["SUCCESS", "FAILED"]:
        end_time = generate_timestamp()
        logger.info("Setting timestamp to %s", end_time)
        try:
            if not args.dry_run:
                res = db.update_one(
                    {
                        "run": run_number,
                        'analysis.analysis_id': args.analysis_id
                    }, {
                        "$set": {
                            "analysis.$": {
                                "analysis_id": args.analysis_id,
                                "end_time": end_time,
                                "user_name": user_name,
                                "out_dir": args.out,
                                "Status": args.status,
                            }
                        }
                    })
                assert res.modified_count == 1, (
                    "Modified {} documents instead of 1".format(
                        res.modified_count))
        except (pymongo.errors.OperationFailure, AssertionError) as e:
            logger.fatal(
                "MongoDB update failure while setting run %s analysis_id %s to %s",
                run_number, args.analysis_id, args.status)
            sys.exit(1)

    else:
        raise ValueError(args.status)

    # close the connection to MongoDB
    connection.close()
Example #31
def start_data_transfer(connection, mux, mux_info, site, mail_to):
    """ Data transfer from source to destination
    """
    run_number, downstream_id, analysis_id, bcl_path = mux_info
    fastq_src = os.path.join(bcl_path, "out", "Project_"+mux)
    bcl_dir = os.path.basename(bcl_path)
    if is_devel_version():
        fastq_base = novogene_conf['FASTQ_DEST'][site]['devel']
    else:
        fastq_base = novogene_conf['FASTQ_DEST'][site]['production']
    fastq_dest = os.path.join(fastq_base, mux, run_number, bcl_dir)
    yaml_dest = os.path.join(fastq_base, mux, mux + "_multisample.yaml")
    rsync_cmd = 'rsync -va %s %s' % (fastq_src, fastq_dest)
    if not os.path.exists(fastq_dest):
        try:
            os.makedirs(fastq_dest)
            logger.info("data transfer started for %s from %s", mux, run_number)
            st_time = generate_timestamp()
            update_downstream_mux(connection, run_number, analysis_id, downstream_id, \
                "COPYING_" + st_time)
            _ = subprocess.check_output(rsync_cmd, shell=True, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            body = "The following command failed with return code {}: {}". \
                format(e.returncode, rsync_cmd)
            subject = "{} from {}: SG10K data transfer ({}) failed".format(mux, run_number, site)
            logger.fatal(body)
            logger.fatal("Output: %s", e.output.decode())
            logger.fatal("Exiting")
            #Send_mail
            send_mail(subject, body, toaddr=mail_to, ccaddr=None)
            #Delete the partial info being rsync
            update_downstream_mux(connection, run_number, analysis_id, downstream_id, "ERROR")
            sys.exit(1)
        #Update the mongoDB for successful data transfer
        sample_info = get_mux_details(run_number, mux, fastq_dest)
        #Touch rsync complete file
        with open(os.path.join(fastq_dest, "rsync_complete.txt"), "w") as f:
            f.write("")
        with open(yaml_dest, 'w') as fh:
            yaml.dump(dict(sample_info), fh, default_flow_style=False)
        job = {}
        job['sample_cfg'] = {}
        for outer_key, outer_value in sample_info.items():
            ctime, _ = generate_window(1)
            job['sample_cfg'].update({outer_key:outer_value})
            job['site'] = site
            job['pipeline_name'] = 'custom/SG10K'
            job['pipeline_version'] = novogene_conf['PIPELINE_VERSION']
            job['ctime'] = ctime
            job['requestor'] = 'userrig'
            if is_devel_version():
                novogene_outdir = os.path.join(novogene_conf['NOVOGENE_OUTDIR'][site]['devel'], \
                    mux)
            else:
                novogene_outdir = os.path.join(novogene_conf['NOVOGENE_OUTDIR'][site]['production'],
                    mux)
            job['out_dir_override'] = novogene_outdir
        logger.info("Data transfer completed successfully for %s from %s", mux, run_number)
        job_id = insert_muxjob(connection, mux, job)
        update_downstream_mux(connection, run_number, analysis_id, downstream_id, job_id)
        subject = "{} from {}: SG10K data transfer ({}) completed".format(mux, run_number, site)
        body = "Data transfer successfully completed for {} from {}".format(mux, run_number)
        send_mail(subject, body, toaddr=mail_to, ccaddr=None)
        return True
    else:
        logger.critical("Mux %s from %s directory already exists under %s", mux, \
            run_number, fastq_dest)
        return False
Example #32
"""
  Generate csv submission for Kaggle contest
"""
import argparse
import logging

from gensim.models import Word2Vec

from utils import extract_elements, choose_answer, preprocess_for_model, generate_timestamp

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
timestamp = generate_timestamp()

parser = argparse.ArgumentParser()
parser.add_argument("-m",
                    "--model",
                    help="path to word2vec/model/timestamp.model")
parser.add_argument("-p",
                    "--project",
                    help="path to validation_set.tsv and submissions")
args = parser.parse_args()

# Load model
model = Word2Vec.load(args.model, mmap='r')

# Load validation set and advance 1 line
validation_set = open("%s/validation_set.tsv" % args.project)
validation_set.readline()

output = open("%s/%s_submission.csv" % (args.project, timestamp), "w")
Example #33
        level=logging.INFO
)

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", help="path to word2vec/model/timestamp.model")
parser.add_argument("-p", "--project", help="path to validation_set.tsv and submissions")
args = parser.parse_args()

# Load model
model = LsiModel.load(args.model, mmap='r')

# Load validation set and advance 1 line
validation_set = open("%s/validation_set.tsv" % args.project)
validation_set.readline()

output = open("%s/%s_submission.csv" % (args.project, generate_timestamp()), "w")
output.write("id,correctAnswer\n")

for line in validation_set:
    elements = line.split("\t")
    question_id = elements.pop(0)

    # Get bag-of-words representation of question and answers
    doc_vectors = [model.id2word.doc2bow(element.split()) for element in elements]
    question = doc_vectors.pop(0)

    # Generate list of tuples:
    # (Cosine similarity, mapped index 0-3 to A-D)
    similarities = [(cossim(model[question], model[answer]), chr(idx + 65)) for idx, answer in
                    enumerate(doc_vectors)]
    chosen_answer = max(similarities)[1]