def canonical(self): """compute new, deleted""" canonical_start = Filename.get(self.start, FileType.CANONICAL) canonical_end = Filename.get(self.end, FileType.CANONICAL) canonical_new = Filename.get(self.end, FileType.CANONICAL, FileAdjective.NEW) self.values['new_canonical'] = comm(canonical_end, canonical_start, canonical_new) canonical_deleted = Filename.get(self.end, FileType.CANONICAL, FileAdjective.DELETED) self.values['deleted_canonical'] = comm(canonical_start, canonical_end, canonical_deleted) self.values['canonical'] = lines_in_file(canonical_end)
def fulltext(self): """Compute the new and deleted bibcodes for each type of error from todays list of bibcodes compared with yesterdays list. Results stored in variables that are then used in report.py.""" for e in conf['FULLTEXT_ERRORS'].keys(): err_msg = "_" + ("_".join(e.split())).replace('-', '_') ft_start = Filename.get(self.start, FileType.FULLTEXT, adjective=None, msg=err_msg + "_") ft_end = Filename.get(self.end, FileType.FULLTEXT, adjective=None, msg=err_msg + "_") ft_new = Filename.get(self.end, FileType.FULLTEXT, adjective=FileAdjective.NEW, msg=err_msg + "_") self.values['new_ft' + err_msg] = comm(ft_end, ft_start, ft_new) ft_deleted = Filename.get(self.end, FileType.FULLTEXT, FileAdjective.DELETED, msg=err_msg + "_") self.values['deleted_ft' + err_msg] = comm(ft_start, ft_end, ft_deleted) self.values['ft' + err_msg] = lines_in_file(ft_end)
def canonical(self): """create local copy of canonical bibcodes""" c = conf['CANONICAL_FILE'] air = Filename.get(self.date, FileType.CANONICAL) logger.info( 'making local copy of canonical bibcodes file, from %s to %s', c, air) shutil.copy(c, air) sort(air)
def solr_bibcodes_finish(self, jobid):
    """get results from earlier submitted job"""
    url = conf.get('SOLR_URL', 'http://localhost:9983/solr/collection1/')
    status = 'batch?command=status&wt=json&jobid='
    get_results = 'batch?command=get-results&wt=json&jobid='
    # wait for solr to process the batch query
    finished = False
    startTime = datetime.now()
    while not finished:
        rStatus = requests.get(url + status + jobid)
        if rStatus.status_code != 200:
            logger.error('batch status check failed, status: %s, text: %s',
                         rStatus.status_code, rStatus.text)
            return False
        j = rStatus.json()
        if j['job-status'] == 'finished':
            finished = True
        else:
            sleep(10)
            if (datetime.now() - startTime).total_seconds() > 3600 * 2:
                logger.error('solr batch process taking too long, seconds: %s;',
                             (datetime.now() - startTime).total_seconds())
                return False
    logger.info('solr batch completed in %s seconds, now fetching bibcodes',
                (datetime.now() - startTime).total_seconds())
    rResults = requests.get(url + get_results + jobid)
    if rResults.status_code != 200:
        logger.error('failed to obtain bibcodes from solr batch query, status: %s, text: %s',
                     rResults.status_code, rResults.text)
        return False
    # finally, save the bibcodes to a file
    bibs = rResults.text  # all ~12 million bibcodes are in this one text field
    # strip the json-ish wrapping down to plain bibcodes; the response
    # already includes newlines between bibcodes
    bibs = re.sub(r'{"bibcode":"|,|"}', '', bibs)
    filename = Filename.get(self.date, FileType.SOLR)
    with open(filename, 'w') as f:
        f.write(bibs)
    sort(filename)
    return True
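# A small illustration of the cleanup above: the get-results payload is
# assumed to wrap each bibcode in a one-line json-ish record, and the single
# re.sub() strips the wrapping while keeping the newlines. The bibcodes here
# are illustrative (the first is taken from the example log below):
import re

sample = '{"bibcode":"2019arXiv190105463B"}\n{"bibcode":"2003ASPC..295..361M"}\n'
print(re.sub(r'{"bibcode":"|,|"}', '', sample))
# prints:
# 2019arXiv190105463B
# 2003ASPC..295..361M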
def solr(self): """compute missing, deleted, new, extra""" solr_end = Filename.get(self.end, FileType.SOLR) canonical_end = Filename.get(self.end, FileType.CANONICAL) solr_missing = Filename.get(self.end, FileType.SOLR, FileAdjective.MISSING) self.values['missing_solr'] = comm(canonical_end, solr_end, solr_missing) solr_start = Filename.get(self.start, FileType.SOLR) solr_new = Filename.get(self.end, FileType.SOLR, FileAdjective.NEW) self.values['new_solr'] = comm(solr_end, solr_start, solr_new) solr_deleted = Filename.get(self.end, FileType.SOLR, FileAdjective.DELETED) self.values['deleted_solr'] = comm(solr_start, solr_end, solr_deleted) solr_extra = Filename.get(self.end, FileType.SOLR, FileAdjective.EXTRA) self.values['extra_solr'] = comm(solr_end, canonical_end, solr_extra) self.values['solr'] = lines_in_file(solr_end)
def fulltext(self): """Get errors from todays fulltext logs and generate a list for each type of error of corresponding bibcodes and source directories. These lists are written to files that are further processed in compute.py""" # types of errors with corresponding file names errors = conf['FULLTEXT_ERRORS'] # get todays date now = datetime.strftime(datetime.now(), "%Y-%m-%d") # loop through types of errors messages for err_msg in errors.keys(): bibs = [] dirs = [] # location of bibcode and directory in message field """example log: {"asctime": "2019-08-26T11:38:34.201Z", "msecs": 201.6739845275879, "levelname": "ERROR", "process": 13411, "threadName": "MainThread", "filename": "checker.py", "lineno": 238, "message": "Bibcode '2019arXiv190105463B' is linked to a non-existent file '/some/directory/filename.xml'", "timestamp": "2019-08-26T11:38:34.201Z", "hostname": "adsvm05"}""" loc_bib = 1 loc_dir = 3 if (err_msg == "No such file or directory"): loc_bib = 3 loc_dir = 11 elif (err_msg == "format not currently supported for extraction"): loc_bib = 7 loc_dir = 23 # loop through files for name in glob.glob(errors[err_msg]): command = "awk -F\: '/" + err_msg + "/ && /" + now + "/ && /ERROR/ {print $0}' " + name args = shlex.split(command) x = Popen(args, stdout=PIPE, stderr=STDOUT) # get bibcodes/directories from todays errors resp = x.communicate()[0].split("\n") for r in resp: if r: r = r.split("'") bibs.append(r[loc_bib]) dirs.append(r[loc_dir]) # create filename based on error message and date fname = Filename.get( self.date, FileType.FULLTEXT, adjective=None, msg="_" + ("_".join(err_msg.split())).replace('-', '_') + "_") # write bibcodes and directories for each error type to file with open(fname, 'w') as f: writer = csv.writer(f, delimiter='\t') writer.writerows(zip(bibs, dirs)) sort(fname)
import poi
from utils import Filename
from utils import setup_log
from utils import save_model

if __name__ == "__main__":
    mdname = "bpr"
    fn = Filename("foursquare")
    setup_log(fn.log(mdname))

    train_cks = poi.load_checkins(open(fn.train))
    test_cks = poi.load_checkins(open(fn.test))
    eva = poi.Evaluation(test_cks, full=False)

    def hook(model):
        # evaluate and checkpoint the model after each training pass
        eva.assess(model)
        save_model(model, "./model/model_%s_%i.pkl" % (mdname, model.current))

    mf = poi.BPR(
        train_cks,
        learn_rate=0.1,
        reg_user=0.08,
        reg_item=0.08,
    )
    mf.train(after=hook)
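# save_model() is imported from utils above; for readers without that module,
# a minimal sketch of what it is assumed to do (serialize the model to the
# given path with pickle). This is a hypothetical stand-in, not the repo's
# actual helper:
import pickle

def save_model(model, path):
    """Pickle the model to path (hypothetical stand-in for utils.save_model)."""
    with open(path, 'wb') as f:
        pickle.dump(model, f)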