def main(argv):
    """Audit phenotype links for every raw file in the `raws` table.

    For each raw file, re-parse its tab/semicolon-delimited payload line by
    line, look up the phenotype recorded for that line, and print DELETE
    statements for phenotype_{plants,samples,aliquots} links that contradict
    what the Oracle side (ora_sql) says the sample actually is.  Nothing is
    deleted directly -- the output is a SQL script for manual review.
    Python 2 (print statements).
    """
    # get all files
    files = sql.fetch_all(None, None, "SELECT * FROM `raws`")
    for f in files:
        print("Processing %s" % f['filename'])
        raw_id = f['id']
        # preprocess: split the blob into cleaned, tokenized lines
        lines = []
        try:
            line_nr = 0
            for line in f['data'].split("\r\n"):
                if len(line) == 0:
                    continue
                line_nr += 1
                line = line.rstrip('\r\n')
                line = re.split(r'\t|;', line)
                line = preprocess_line(line)
                lines.append(line)
        except:
            # bare except on purpose: report the offending line, then re-raise
            print "%d: %s" % (line_nr, line)
            raise
        # fix!
        line_nr = 0
        try:
            for line in lines:
                line_nr += 1
                phenotype = format_line(line)
                # create a readable program
                # look up the phenotype id based on the line number
                try:
                    # NOTE(review): 808/178 looks like a magic entity/value pair
                    # to skip -- confirm its meaning against the schema
                    if (phenotype['entity_id'] == 808 and phenotype['value_id'] == 178):
                        continue
                    phenotype_id = sql.fetch_all('phenotype_raws', {
                        'line_nr': line_nr,
                        'raw_id': raw_id
                    })[0]['phenotype_id']
                except:
                    print "%d: %d" % (line_nr, raw_id)
                    raise
                # get the phenotype_%s % (plant, sample, aliquot) id, if any
                ph_plant = sql.fetch('phenotype_plants', phenotype_id, 'phenotype_id')
                ph_sample = sql.fetch('phenotype_samples', phenotype_id, 'phenotype_id')
                ph_aliquot = sql.fetch('phenotype_aliquots', phenotype_id, 'phenotype_id')
                # check where the link should belong and remove the others, if any
                if ora_sql.is_plant(phenotype['sample_id']) or ora_sql.was_plant(phenotype['sample_id']):
                    if ph_sample != False:
                        print "DELETE FROM `phenotype_samples` WHERE id = %s;" % ph_sample['id']
                    if ph_aliquot != False:
                        print "DELETE FROM `phenotype_aliquots` WHERE id = %s;" % ph_aliquot['id']
                elif ora_sql.is_sample(phenotype['sample_id']):
                    if ph_aliquot != False:
                        print "DELETE FROM `phenotype_aliquots` WHERE id = %s;" % ph_aliquot['id']
                    if ph_plant != False:
                        print "DELETE FROM `phenotype_plants` WHERE id = %s;" % ph_plant['id']
                elif ora_sql.is_aliquot(phenotype['sample_id']):
                    if ph_sample != False:
                        print "DELETE FROM `phenotype_samples` WHERE id = %s;" % ph_sample['id']
                    if ph_plant != False:
                        print "DELETE FROM `phenotype_plants` WHERE id = %s;" % ph_plant['id']
                else:
                    print "%s NOT found!!" % phenotype['sample_id']
        except:
            progress("%d: %s" % (line_nr, line))
            raise
def countPending(): results = fetch('select count(*) from unsubs') print results results = fetch('select count(hash) from unsubs') print results results = fetch('select count(distinct unsubhash) as b from anonymousanalytics group by emailhash order by b desc') #print results results = fetch('select count(distinct emailhash ) from anonymousanalytics ') print results
def deleteReadEmail17days(): results = fetch('select email from readmail') print results[-10:] start = 6000 for r in results: commit('delete from readmail where email=%s',str(start)) start += 1 results = fetch('select email from readmail') print results[-10:]
def deleteLastReads(): results = fetch('select email from readmail') print results[-10:] total = len(results) for r in results[total-150:]: commit('delete from readmail where email=%s',r[0]) results = fetch('select email from readmail') print results[-10:] #deleteLastReads() #deleteReadEmail17days()
def printAnalytics():
    """Log a dump of the analytics tables plus summary success/failure counts."""
    log.tid = newHash()
    for label, query in [
        ('all analytics', 'select * from analytics'),
        ('current unsubs', 'select count(*) from unsubs'),
    ]:
        log.info(label, fetch(query))
    log.info('print analytics total, successful, all broken')
    for label, query in [
        ('total', 'select count(*) from analytics'),
        ('successful', 'select count(*) from analytics where success=1'),
    ]:
        log.info(label, fetch(query))
    # the broken (success=0) rows, unlabeled
    log.info(fetch('select email, url from analytics where success=0'))
    log.info('success / not success for william.k.dvorak')
    log.info(getAnalyticsForEmail('*****@*****.**'))
def getFive():
    """Grab unsub rows to work on; returns (list of UnSub, set of their hashes).

    Despite the name, the first query is LIMIT 1.  Returns ([], empty set)
    when the table is empty.
    """
    # random order in case there's two slaves, don't likely grab the same unsub in high volume
    rows = fetch('select url, email, hash from unsubs order by RAND() limit 1')
    hashes = set(str(row[2]) for row in rows)
    origSet = set(hashes)
    if not hashes:
        return [], origSet
    # crude IN-clause: reuse the list repr, swapping brackets for parens
    in_clause = str(list(hashes)).replace('[', '(').replace(']', ')')
    rows = fetch('select url, email, hash from unsubs where hash in ' + in_clause)
    return [UnSub(row[0], row[1], row[2]) for row in rows], origSet
def deleteAllUnsubs():
    """Delete every unsub row -- but only when there are fewer than 15,
    as a guard against wiping a busy queue."""
    rows = fetch('select hash from unsubs')
    log.info('deleting all unsubs with # unsubs ' + str(len(rows)))
    if len(rows) >= 15:
        return
    for row in rows:
        commit('delete from unsubs where hash=%s', row[0])
def getAnalyticsForEmail(email):
    """Return [#successful, #failed] (both as strings) for an email's
    anonymous analytics.

    Special cases: email == 'admin' swaps in global counts; 'admin24' swaps
    in global counts restricted to the last 24 hours.
    """
    digest = hashEmail(email)
    total = fetch('select count(*) from anonymousanalytics where emailhash=%s', digest)[0][0]
    successful = fetch('select count(*) from anonymousanalytics where emailhash=%s and success=1', digest)[0][0]
    if email == 'admin':
        total = fetch('select count(*) from anonymousanalytics')[0][0]
        successful = fetch('select count(*) from anonymousanalytics where success=1')[0][0]
    if email == 'admin24':
        cutoff = str(datetime.datetime.now() - timedelta(hours=24))
        total = fetch('select count(*) from anonymousanalytics where stamp > %s', cutoff)[0][0]
        successful = fetch('select count(*) from anonymousanalytics where success=1 and stamp > %s', cutoff)[0][0]
    return [str(int(successful)), str(int(total) - int(successful))]
def main(argv):
    """Link Excel temperature rows to their source file in the DB.

    For each file argument: find its `ufiles` row by basename, read the
    requested number of spreadsheet pages, and for every data row insert a
    (ufile_id, temp_id) link into `ufiletemps` for each matching `temps`
    record (matched on Datum + StandortID).  Commits once per file.
    Python 2 (print statements, xrange).
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('files', nargs='+')
    parser.add_argument('--standortid', type=int)  # fallback location id when the sheet has none
    parser.add_argument('--pages', type=int, default=1)
    args = parser.parse_args(argv)
    for full_fn in args.files:
        fn = ntpath.basename(full_fn)
        # look up the file id
        file_id = sql.fetch('ufiles', fn, id_key='name')
        if not file_id:
            print "File '%s' not found in DB, skipping" % fn
        else:
            data = []
            headers = []
            for page in xrange(args.pages):
                data, headers = p_xls.read_xls_data(full_fn, page)
                lines = 0  # administration: number of succesfully inserted lines
                for row in data:
                    # location comes from the row itself, else the CLI flag, else abort
                    standortid = -1
                    if hasattr(row, 'StandortID'):
                        standortid = getattr(row, 'StandortID')
                    elif args.standortid != None:
                        standortid = args.standortid
                    else:
                        sys.stderr.write('No StandortID found!')
                        exit()
                    rs = sql.fetch_all('temps', {
                        'datum': getattr(row, 'Datum'),
                        'location_id': standortid
                    })
                    if rs != False:  # and len(rs) == 1:
                        for i in xrange(len(rs)):
                            if (sql.insert('ufiletemps', {
                                'ufile_id': file_id['id'],
                                'temp_id': rs[i]['id']
                            })):
                                lines += 1
                            else:
                                # failed insert: dump the pair for manual follow-up
                                print "%d,%d" % (file_id['id'], rs[i]['id'])
                print "Inserted %d/%d of page %d" % (lines, len(data), page)
        sql.commit()  # after each file
    return None
def allUnsuccessful(): results = fetch('select email, url from analytics where success=0') ss = dict() for r in results: ss[str(r[1])] = str(r[0]) i = 0 low = 200 high = low+10 print len(ss.keys()) for k,v in ss.iteritems(): if i > low: os.system('open '+k) print v print k i+=1 if i > high: break
def anonymousAnalytics(email, unsubhash, success=False):
    """Upsert an anonymous-analytics row keyed by unsub hash.

    Unknown hash: insert a fresh row (hashed email, success flag, timestamp).
    Known hash: flip success 0 -> 1 when this attempt succeeded; otherwise
    just log that it is still failing.  Never downgrades a success.
    """
    digest = hashEmail(email)
    now = str(datetime.datetime.now())
    existing = fetch(
        'select unsubhash, success from anonymousanalytics where unsubhash=%s',
        (unsubhash))
    success = int(success)
    if not existing:
        commit(
            'insert into anonymousanalytics (emailhash, unsubhash, success, stamp) values (%s, %s, %s, %s)',
            (digest, unsubhash, str(success), now))
        return
    if success and int(existing[0][1]) == 0:
        commit(
            'update anonymousanalytics set success=1 where unsubhash=%s',
            (unsubhash))
    else:
        log.info('unsub hash is still failing, do not update analytics', unsubhash)
def handle(request): info = [] # # NEW JOB REQUESTED # if request[0] == 'n': print "Got a request for a new job" data = sql.fetch("SELECT * from openjobs WHERE jobtype > -1 AND tid = (SELECT max(tid));")[0] # Build the query to move job to pending table pending_q = ("INSERT INTO pendingjobs(tid, jobtype, jobdesc, jobdata) VALUES (%s, %s, '%s', '%s');" % (data[0], data[1], data[2], data[3])) # Build info with delimeters for transit for el in data: el = str(el) info.append(el) info.append("^") info.pop(-1) # Remove the last item, as its a spare delimeter # Move job to pending table print "> Moving job with table ID [" + str(info[0]) + "] to the pending jobs table" res = sql.update(pending_q) if res == -1: print ">> There was an error moving job to the pending table! Changes reverted" elif res == 1: print ">> Job moved to pending table" # Remove job from open table print "> Removing job with table ID [" + str(info[0]) + "] from open jobs"; res1 = sql.update("delete from openjobs where tid = %s;" % str(info[0])) if res1 == -1: print ">> There was an error removing the job from the open table! Changes reverted" elif res == 1: print ">> Job removed from open table" # # UPDATE TO JOB REQUESTED # if request[0] == 'u': # EXPECTING : u, tid, type, desc, data # If sorting : data = item,item,item;ORIGINAL_DESC;ORIGINAL_DATA # Insert data into closed print "Got a request to update job" print "> Moving job with table ID [" + str(request[1]) + "] to the closed table" sql.update("INSERT INTO closedjobs(tid, jobtype, jobdesc, jobdata) VALUES (%s, %s, '%s', '%s');" % (request[1], request[2], request[3], request[4])) # Remove item from pending print "> Removing job with table ID [" + str(request[1]) + "] from the pending jobs"; res1 = sql.update("delete from pendingjobs where tid = %s;" % str(request[1])) if res1 == -1: print ">> There was an error removing the job from the pending table! 
Changes reverted" elif res1 == 1: print ">> Job removed from pending table" # Thank the client info.append("skynet thanks you for your service") # # REPORT REQUESTED # if request[0] == 'r': info.append("REQ FOR REPORT") print "Got a request for a report" # # JOIN AND SEND DATA # reply = ''.join(info).replace('\t', ' ') reply += '\t' return reply
def percentSuccess(): results = fetch('select success, count(*) from anonymousanalytics group by success') print 'success ', results dt = datetime.datetime.now() - timedelta(days=20) results = fetch('select success, count(*) from anonymousanalytics where stamp > %s group by success',str(dt)[:11]) print 'success within 20 days', results
def index():
    """Flask view for the home page.

    Anonymous visitors get an OAuth2 auth URL; logged-in users get their
    settings form and, on POST, phone-number / SMS-verification / Teams /
    email-over-SMS updates or account deletion.  Renders home.html with the
    accumulated htmlArguments.
    NOTE(review): databaseInfo column meanings inferred from usage --
    [1]=phone, [2]=Teams flag, [4]=email-over-SMS, [5]=verified, [6]=code;
    confirm against the schema.
    """
    # Bootstrap alerts and errors that will popup on the top of the screen
    alerts = []
    errors = []
    # Arguments being passed through to the html page
    htmlArguments = {}
    # Checks if an error is detected in the HTML arguments (generally the text
    # after '?' in the domain) and adds the information to the Bootstrap errors
    if 'error' in request.args:
        errors.append({
            'error': request.args['error'],
            'error_description': request.args['error_description']
        })
    # Checks if the user isn't logged in locally
    if not session.get("user"):
        # Creates a state for the session for the user
        session["state"] = str(uuid.uuid4())
        # Creates the OAuth2 redirect URL for the user to be logged into,
        # which is passed through into the html arguments
        auth_url = _build_auth_url(scopes=app_config.SCOPE, state=session["state"])
        htmlArguments['auth_url'] = auth_url
    else:
        # Gets email of the user, and looks up user in the database
        emailOfUser = session["user"]["preferred_username"]
        databaseInfo = sql.fetch(emailOfUser).fetchone()
        # if user is not found in database or invalid refresh token
        if not databaseInfo or not databaseInfo[0]:
            # logs out user
            return redirect(url_for("logout"))
        # Checks if user requires SMS verification, by searching if user has
        # phone number saved, but not verified
        requireSMSVerification = databaseInfo[1] and not databaseInfo[5]
        # Checks if the user wishes to receive Microsoft Teams notifications
        getTeamsNotifications = databaseInfo[2]
        # Checks if the user wishes to send and receive emails over SMS
        emailOverSMS = databaseInfo[4]
        # Prefills phone number on HTML form if phone number is already in the database
        if databaseInfo[1]:
            htmlArguments['prefilledPhoneNumber'] = databaseInfo[1]
        else:
            htmlArguments['prefilledPhoneNumber'] = ""
        # Checks if the user has made a POST request
        if request.method == 'POST':
            # Checks if the user pressed the update button
            if 'updateButton' in request.form:
                # Gets the phone number from the form
                phoneNumber = request.form['phoneNumber']
                # Gets the verification code from the form if required
                if requireSMSVerification:
                    verificationCodeFromUser = request.form['smsVerificationCode']
                    # Checks if user attempted entering a verification code
                    if verificationCodeFromUser:
                        # Clears verification code and sets phone as verified
                        # if verified code is correct
                        if verificationCodeFromUser == databaseInfo[6]:
                            sql.updateVal(emailOfUser, 'VerifiedPhone', True)
                            sql.updateVal(emailOfUser, 'VerificationCode', None)
                            requireSMSVerification = False
                            send(
                                "OfficeConnected: You have successfully connected your phone! Reply with 'CMD' to get a full list of commands you can do with OfficeConnected",
                                databaseInfo[1])
                        else:
                            # Tells user that verification code is wrong through Bootstrap
                            errors.append({
                                "error": "Invalid SMS verification code",
                                "error_description": "You have entered an invalid verification code, make sure you've typed the right characters. If you would like a new verification code, you can reply 'LINK' to the SMS message"
                            })
                # Checks if user is trying to update phone the phone number to
                # a different one from database
                if databaseInfo[1] != phoneNumber and phoneNumber:
                    # Checks if updated phone number already exists in the
                    # database and tells user error through Bootstrap
                    if sql.fetchPhone(phoneNumber).fetchone():
                        errors.append({
                            "error": "Phone number already exists",
                            "error_description": "An account with that phone number already exists in our database, please enter a valid phone number, or to unlink that number, text 'UNLINK' to +1 (844)-961-2701"
                        })
                    else:
                        # Updates unverified phone number in database
                        sql.updateVal(emailOfUser, 'PhoneNumber', phoneNumber)
                        sql.updateVal(emailOfUser, 'VerifiedPhone', False)
                        sql.updateVal(emailOfUser, 'VerificationCode', None)
                        # Replace html argument to updated phone number
                        htmlArguments['prefilledPhoneNumber'] = phoneNumber
                        requireSMSVerification = True
                        # Notifying user over text and Bootstrap alert to verify phone number
                        send(
                            "OfficeConnected: Verify your phone by responding with the message 'LINK' to receive your verification code",
                            phoneNumber)
                        alerts.append(
                            "A message has been sent to your phone. Please verify your phone by responding with the message 'LINK' and entering your verification code"
                        )
                # Updates if the user wants to get Teams notifications based on
                # if the getTeamsNotification checkbox is checked in HTML
                if 'getTeamsNotifications' in request.form and request.form['getTeamsNotifications'] == 'on':
                    getTeamsNotifications = True
                    sql.updateVal(emailOfUser, 'GetSMSTeamNotifications', True)
                else:
                    getTeamsNotifications = False
                    sql.updateVal(emailOfUser, 'GetSMSTeamNotifications', False)
                # Updates if the user wants to allow email over SMS based on
                # if the emailOverSMS checkbox is checked in HTML
                if 'emailOverSMS' in request.form and request.form['emailOverSMS'] == 'on':
                    emailOverSMS = True
                    sql.updateVal(emailOfUser, 'EmailOverSMS', True)
                else:
                    emailOverSMS = False
                    sql.updateVal(emailOfUser, 'EmailOverSMS', False)
            # Checks if the deleteAccount button has been pressed, and clears
            # user from database
            elif 'deleteAccount' in request.form:
                sql.delete(emailOfUser)
                return redirect(url_for("logout"))
        # sets respective HTML arguments to their variables on Python to be
        # passed through in Flask
        htmlArguments['getTeamsNotificationsBool'] = getTeamsNotifications
        htmlArguments['emailOverSMSBool'] = emailOverSMS
        htmlArguments['requireSMSVerification'] = requireSMSVerification
        # Passes through basic user info to Flask
        htmlArguments['user'] = session['user']
    # Passes through Bootstrap alerts and errors to HTML
    htmlArguments['errors'] = errors
    htmlArguments['alerts'] = alerts
    # Renders the HTML, with htmlArguments as it's arguments
    return render_template('home.html', **htmlArguments)
def numUnsubs():
    """Return how many rows are currently in the unsubs table."""
    return len(fetch('select hash from unsubs'))
def getProblem(): hashh = 'lrnbgg19' results = fetch('select * from unsubs where hash="lrnbgg19"') print results results = fetch('select count(*) from unsubs ') print results
def main(level='para', process_mode=''):
    """Build the hand-crafted ('sise') feature set for tip candidates.

    Fits a TF-IDF model over all answer tip candidates (paragraph- or
    sentence-level per `level`), then for every labeled text in
    datasets/{level}_tip.ds computes metadata features (scores, ages,
    reputation, sizes), a question-similarity score, and POS/code features,
    and pickles the per-fold feature set.
    Relies on module globals: `home`, `fetch`, `c` (presumably a regex
    fragment used to match tag bodies -- TODO confirm), process_raw_txt,
    remove_puncs, sent_tokenize.  Python 3.
    """
    tokenize = lambda doc: doc.lower().split(" ")
    # fit TF-IDF on all tip candidates at the chosen granularity
    if level == 'para':
        sql = "SELECT tipcands FROM answers"
        res = fetch(sql)
        docs = []
        for item in res:
            item = item['tipcands']
            item = item.replace('\n', '')
            item = item.replace('\r', '')
            item = item.replace('\r\n', '')
            item = item.strip()
            item = re.sub('(<p>|</p>)', '', item)
            item = re.sub('<[^>]*>', ' ', item)
            item = process_raw_txt(item, mode=process_mode)
            item = remove_puncs(item)
            docs.append(item)
        sklearn_tfidf = TfidfVectorizer(min_df=0, tokenizer=tokenize)
        sklearn_tfidf.fit(docs)
    else:
        sql = "SELECT tipcands FROM answers"
        res = fetch(sql)
        docs = []
        for item in res:
            item = item['tipcands']
            item = item.replace('\n', '')
            item = item.replace('\r', '')
            item = item.replace('\r\n', '')
            item = item.strip()
            item = re.sub('(<p>|</p>)', '', item)
            item = re.sub('<[^>]*>', ' ', item)
            sents = sent_tokenize(item)
            for sent in sents:
                sent = process_raw_txt(sent, mode=process_mode)
                sent = remove_puncs(sent)
                docs.append(sent)
        sklearn_tfidf = TfidfVectorizer(min_df=0, tokenizer=tokenize)
        sklearn_tfidf.fit(docs)
    # load positive/negative ids and the id -> text dataset
    f = open(home + 'datasets/{}_tip.pos'.format(level), 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    pos_ids = [line.replace('\n', '').split('\t')[0] for line in texts]
    f = open(home + 'datasets/{}_tip.neg'.format(level), 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    neg_ids = [line.replace('\n', '').split('\t')[0] for line in texts]
    f = open(home + 'datasets/{}_tip.ds'.format(level), 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    texts = [line.replace('\n', '').split('\t') for line in texts]
    id2text = {idx: [remove_puncs(txt)] for idx, txt in texts}
    matrix = []
    for idx, txt in enumerate(texts):
        if level == 'para':
            sql = "SELECT * FROM paragraphs where `ID`=%s"
        elif level == 'sent':
            sql = "SELECT * FROM sentences where `ID`=%s"
        try:
            item = fetch(sql, txt[0])[0]
        except Exception as e:
            print(txt)
            print(fetch(sql, txt[0]))
            raise e
        parent_id = item['parent_id']
        temp = []
        temp.append(int(txt[0]))
        temp.append(item['score'])  #answer score
        # prefer the last-edit date; fall back to creation
        if item['LastEditDate'] == '':
            adate = item['CreationDate']
        else:
            adate = item['LastEditDate']
        sql = "SELECT * FROM threads where `id` = %s"
        q = fetch(sql, item['parent_id'])
        if q[0]['LastEditDate'] == '':
            qdate = q[0]['CreationDate']
        else:
            qdate = q[0]['LastEditDate']
        # split('.')[0] drops fractional seconds before parsing
        adate = datetime.strptime(adate.split('.')[0], "%Y-%m-%dT%H:%M:%S")
        qdate = datetime.strptime(qdate.split('.')[0], "%Y-%m-%dT%H:%M:%S")
        diff = adate - qdate
        adiff = diff.total_seconds() / 3600.0
        temp.append(adiff)  #answer time difference to question
        now = datetime.strptime(datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), "%Y-%m-%dT%H:%M:%S")
        aage = (now - adate).total_seconds() / 3600.0
        temp.append(aage)  #answer age
        temp.append(q[0]['score'])  #question score
        temp.append(q[0]['FavoriteCount'])  #question favorites
        sql = "SELECT * FROM users where `ID` = %s"
        r = fetch(sql, item['OwnerUserId'])
        if len(r) > 0:
            temp.append(r[0]['Reputation'])  #question use reputations
        else:
            temp.append(0)
        temp.append(q[0]['ViewCount'])  #question views
        temp.append((now - qdate).total_seconds() / 3600.0)  #question age
        tokens = id2text[txt[0]][0].split()
        temp.append(len(tokens))  #number of tokens in sentence
        # answer size: word count of the answer body's plain-text paragraphs
        sql = "SELECT * FROM answer_body where `ID` = %s"
        body = fetch(sql, item['answer_id'])[0]['body']
        cnt = 0
        try:
            # strip quoted blocks first
            m = re.findall(
                r'(?:<blockquote>(?:(?!\n<p>|\n<ul>|\n<a href|\n<ol>|\n<h1>|\n<h2>).)*</blockquote>)',
                body, re.DOTALL)
            for s in m:
                body = body.replace(s, '')
            lis = re.findall(
                r'(?:<pre%s*><code>%s*</code></pre>)|(?:<p>%s*</p>)|(?:<ol>%s*</ol>)|(?:<ul>%s*</ul>)'
                % (c, c, c, c, c), body, re.DOTALL)
            l = len(lis)
            i = 0
            paras = []
            # pair each non-<pre> block with the run of <pre> code blocks following it
            while i < l:
                if not lis[i].startswith('<pre'):
                    code = []
                    trigger = False
                    for j in range(i + 1, l):
                        trigger = True
                        if lis[j].startswith('<pre'):
                            code.append(lis[j])
                        else:
                            break
                    m = '\n$$$$$\n'.join(code)
                    paras.append([lis[i], m])
                    if trigger:
                        i = j
                    else:
                        i = i + 1
                else:
                    i = i + 1
            for unit in paras:
                para = unit[0]
                para = para.replace('\n', '')
                para = para.replace('\r', '')
                para = para.replace('\r\n', '')
                para = para.strip()
                para = re.sub('(<p>|</p>)', '', para)
                para = re.sub('<[^>]*>', ' ', para)
                cnt = cnt + len(para.split())
        except Exception as e:
            raise e
        temp.append(cnt)  #answer size
        # collect the question's sentences (>5 words) plus its title
        question = []
        sql = "SELECT * FROM threads WHERE `id` =%s"
        res = fetch(sql, parent_id)[0]
        try:
            body = res['Body'].lower()
            m = re.findall(
                r'(?:<blockquote>(?:(?!\n<p>|\n<ul>|\n<a href|\n<ol>|\n<h1>|\n<h2>).)*</blockquote>)',
                body, re.DOTALL)
            for s in m:
                body = body.replace(s, '')
            lis = re.findall(
                r'(?:<pre%s*><code>%s*</code></pre>)|(?:<p>%s*</p>)|(?:<ol>%s*</ol>)|(?:<ul>%s*</ul>)'
                % (c, c, c, c, c), body, re.DOTALL)
            l = len(lis)
            i = 0
            paras = []
            while i < l:
                if not lis[i].startswith('<pre'):
                    code = []
                    trigger = False
                    for j in range(i + 1, l):
                        trigger = True
                        if lis[j].startswith('<pre'):
                            code.append(lis[j])
                        else:
                            break
                    m = '\n$$$$$\n'.join(code)
                    paras.append([lis[i], m])
                    if trigger:
                        i = j
                    else:
                        i = i + 1
                else:
                    i = i + 1
            for unit in paras:
                para = unit[0]
                para = para.replace('\n', '')
                para = para.replace('\r', '')
                para = para.replace('\r\n', '')
                para = para.strip()
                para = re.sub('(<p>|</p>)', '', para)
                para = re.sub('<[^>]*>', ' ', para)
                # para = ' '.join(para.split())
                sents = sent_tokenize(para)
                sents = [remove_puncs(sent)\
                    for sent in sents if len(sent.split()) > 5]
                if len(sents) > 0:
                    question = question + sents
            title = res['Title'].lower()
            question.append(remove_puncs(title))
        except Exception as e:
            raise e
        # mean TF-IDF cosine similarity between the candidate and the question
        doc = item['tipcands']
        if level == 'para':
            para = item['tipcands']
            para = para.replace('\n', '')
            para = para.replace('\r', '')
            para = para.replace('\r\n', '')
            para = para.strip()
            para = re.sub('(<p>|</p>)', '', para)
            para = re.sub('<[^>]*>', ' ', para)
            para = ' '.join(para.split())
            sents = sent_tokenize(para)
            sents = [remove_puncs(sent)\
                for sent in sents if len(sent.split()) > 5]
            s = 0.0
            for doc0 in question:
                for doc1 in sents:
                    vec0 = sklearn_tfidf.transform([doc0]).toarray()[0]
                    vec1 = sklearn_tfidf.transform([doc1]).toarray()[0]
                    s = s + cosine_similarity(vec0, vec1)
            if s != 0.0:
                s = s / (len(question) * len(sents))
            temp.append(s)
        else:
            doc1 = remove_puncs(doc)
            s = 0.0
            for doc0 in question:
                vec0 = sklearn_tfidf.transform([doc0]).toarray()[0]
                vec1 = sklearn_tfidf.transform([doc1]).toarray()[0]
                s = s + cosine_similarity(vec0, vec1)
            if s != 0.0:
                s = s / len(question)
            temp.append(s)
        # POS features
        postags = [
            pair[1] for pair in nltk.pos_tag(word_tokenize(' '.join(id2text[txt[0]])))
        ]
        counter = collections.Counter(postags)
        if counter["NN"] > 0:
            temp.append(counter["NN"])  #number of nouns
        else:
            temp.append(0)
        if postags[0] == "NN":
            temp.append(1)  #sentence starts with noun
        else:
            temp.append(0)
        codes = re.findall(r'(?:<code>(?:(?!<code>).)*</code>)', doc, re.DOTALL)
        num = sum([
            len(code.replace('<code>', '').replace('</code>', ''))
            for code in codes
        ])
        temp.append(num)  #number of characters that are code
        counter = collections.Counter(tokens)
        temp.append(counter['be'])
        matrix.append(temp)
    print(np.array(matrix).shape)
    # index features by id; first column is the id itself
    features = {}
    for f in matrix:
        ids = str(int(f[0]))
        docs = id2text[ids]
        features[ids] = [f[1:], docs]
    cv = 10
    def build_data_cv(cv=cv):
        """ Loads data and split into 10 folds. """
        process_mode = ''
        with open(
                home + 'datasets/cv-{}-{}-dataset.pickle'.format(level, process_mode),
                'rb') as f:
            revs = pickle.load(f)
        temp = []
        for item in revs:
            ids = item['id']
            fset = features[ids][0]
            txt = features[ids][1]
            datum = {
                "y": item['y'],
                # "text": txt,
                "id": ids,
                "fset": fset,
                "split": item['split']
            }
            temp.append(datum)
        return temp
    entirefset = build_data_cv()
    with open(
            home + 'datasets/feature_set/{}-{}-sise-entirefset.pickle'.format(
                level, process_mode), 'wb') as f:
        pickle.dump(entirefset, f)
r'.*<code>.*\b%s\b.*</code>.*' % key,\ r'.*<a.*href.*%s\.php.*>.*</a>.*' % key] for i in range(len(re_body)): regexp = re.compile(re_body[i]) if regexp.search(str_body): is_found = True break return is_found posts = TextLoader(home + 'raw_data/Posts.xml') keywords = posts.read(home + 'keywords.ls') keywords = set(keywords) sql = "SELECT `parentid`, `acceptedanswerid` FROM kws2TID" f = fetch(sql) threadsid = [ids['parentid'] for ids in f] acceptedids = [ ids['acceptedanswerid'] for ids in f if ids['acceptedanswerid'] != -1 ] acceptedids = set(acceptedids) def partitions(pmids, n): "Partitions the pmids into n subsets" nodes_iter = iter(pmids) while True: partition = tuple(itertools.islice(nodes_iter, n)) if not partition: return yield partition
def main(level='para', process_mode='', mode='static', word_vectors='nonrand'):
    """Build the combined ('trip') feature set for tip candidates.

    Loads classifier outputs (one-hot and normal paris_cls ids), the CNN
    feature map, the previously built 'sise' features and a w2v vocabulary;
    fits TF-IDF plus word/POS n-gram vectorizers; then for every labeled
    text computes n-gram counts, template matches, averaged / IDF-weighted
    word vectors, concatenates everything with the sise and CNN features,
    and pickles the per-fold feature set.
    Relies on module globals: `home`, `fetch`, `c` (presumably a regex
    fragment -- TODO confirm), process_raw_txt, remove_puncs, sent_tokenize,
    templates directory under datasets/ngrams/.  Python 3.
    """
    print('---{}---{}---'.format(level, process_mode))
    # classifier-output id -> label maps (file format: "<label> <id>" per line)
    f = open(home + 'onehot-{}_paris_cls_id.txt'.format(level), 'r')
    outs = f.readlines()
    f.close()
    one_hot_paris_cls_id = {
        line.replace('\n', '').split(' ')[1]: line.replace('\n', '').split(' ')[0]
        for line in outs
    }
    f = open(
        home + 'normal-{}-{}-{}-{}_paris_cls_id.txt'.format(
            level, mode, word_vectors, process_mode), 'r')
    outs = f.readlines()
    f.close()
    normal_paris_cls_id = {
        line.replace('\n', '').split(' ')[1]: line.replace('\n', '').split(' ')[0]
        for line in outs
    }
    with open(
            home + 'datasets/feature_map/{}-{}-{}-{}-fmap.pickle'.format(
                level, mode, word_vectors, process_mode), 'rb') as handle:
        cnn_fmap = pickle.load(handle)
    with open(
            home + 'datasets/feature_set/{}-{}-sise-entirefset.pickle'.format(
                level, process_mode), 'rb') as handle:
        temp = pickle.load(handle)
    sise_fs = {}
    for entity in temp:
        sise_fs[entity['id']] = entity['fset']
    with open(home + 'datasets/w2v_model/wv_{}.pickle'.format(process_mode),
              'rb') as handle:
        vocab = pickle.load(handle)
    embeddings_index = {}
    for word in vocab:
        embeddings_index[word] = vocab[word]
    # embedding dimensionality taken from the first vector
    for word in embeddings_index:
        img_cols = len(embeddings_index[word])
        break
    tokenize = lambda doc: doc.lower().split(" ")
    # fit TF-IDF over all tip candidates at the chosen granularity
    if level == 'para':
        sql = "SELECT tipcands FROM answers"
        res = fetch(sql)
        docs = []
        for item in res:
            item = item['tipcands']
            item = item.replace('\n', '')
            item = item.replace('\r', '')
            item = item.replace('\r\n', '')
            item = item.strip()
            item = re.sub('(<p>|</p>)', '', item)
            item = re.sub('<[^>]*>', ' ', item)
            item = process_raw_txt(item, mode=process_mode)
            item = remove_puncs(item)
            docs.append(item)
        sklearn_tfidf = TfidfVectorizer(min_df=0, tokenizer=tokenize)
        sklearn_tfidf.fit(docs)
    else:
        sql = "SELECT tipcands FROM answers"
        res = fetch(sql)
        docs = []
        for item in res:
            item = item['tipcands']
            item = item.replace('\n', '')
            item = item.replace('\r', '')
            item = item.replace('\r\n', '')
            item = item.strip()
            item = re.sub('(<p>|</p>)', '', item)
            item = re.sub('<[^>]*>', ' ', item)
            sents = sent_tokenize(item)
            for sent in sents:
                sent = process_raw_txt(sent, mode=process_mode)
                sent = remove_puncs(sent)
                docs.append(sent)
        sklearn_tfidf = TfidfVectorizer(min_df=0, tokenizer=tokenize)
        sklearn_tfidf.fit(docs)
    idf = sklearn_tfidf.idf_
    vocabulary = sklearn_tfidf.vocabulary_
    # load all template n-grams (first tab-field of every line of every file)
    templates = []
    path = home + 'datasets/ngrams/'
    for (path, dirs, files) in os.walk(path):
        for file in files:
            fp = open(path + file)
            output = fp.readlines()
            fp.close()
            templates = templates + [
                item.split('\t')[0].replace('\n', '') for item in output
            ]
    f = open(home + 'datasets/{}_tip.pos'.format(level), 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    pos_ids = [line.replace('\n', '').split('\t')[0] for line in texts]
    f = open(home + 'datasets/{}_tip.neg'.format(level), 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    neg_ids = [line.replace('\n', '').split('\t')[0] for line in texts]
    f = open(home + 'datasets/{}_tip.ds'.format(level), 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    texts = [line.replace('\n', '').split('\t') for line in texts]
    id2text = {idx: [remove_puncs(txt)] for idx, txt in texts}
    id2text_with_puncs = {idx: txt for idx, txt in texts}
    train_set = [remove_puncs(item[1]) for item in texts]
    # word n-gram count vectorizers (uni/bi/tri)
    unigram_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        ngram_range=(1, 1), min_df=5)
    unigram_vectorizer.fit(train_set)
    bigram_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        ngram_range=(2, 2), min_df=5)
    bigram_vectorizer.fit(train_set)
    trigram_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        ngram_range=(3, 3), min_df=5)
    trigram_vectorizer.fit(train_set)
    # POS-tag n-gram vectorizers over the POS-tagged corpus
    postagged_texts = []
    for _, txt in texts:
        postagged_texts.append(' '.join([pair[1] for pair in \
            nltk.pos_tag(word_tokenize(remove_puncs(txt)))]).lower())
    pos_unigram_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        ngram_range=(1, 1), min_df=5)
    pos_unigram_vectorizer.fit(postagged_texts)
    pos_bigram_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        ngram_range=(2, 2), min_df=5)
    pos_bigram_vectorizer.fit(postagged_texts)
    pos_trigram_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        ngram_range=(3, 3), min_df=5)
    pos_trigram_vectorizer.fit(postagged_texts)
    matrix = []
    for idx, txt in enumerate(texts):
        if level == 'para':
            sql = "SELECT * FROM paragraphs where `ID`=%s"
        elif level == 'sent':
            sql = "SELECT * FROM sentences where `ID`=%s"
        try:
            item = fetch(sql, txt[0])[0]
        except Exception as e:
            print(txt)
            print(fetch(sql, txt[0]))
            raise e
        temp = []
        temp.append(int(txt[0]))
        #TGrams
        ngram = unigram_vectorizer.transform(id2text[txt[0]])
        temp.append(sum(ngram.tocoo().data))
        ngram = bigram_vectorizer.transform(id2text[txt[0]])
        temp.append(sum(ngram.tocoo().data))
        ngram = trigram_vectorizer.transform(id2text[txt[0]])
        temp.append(sum(ngram.tocoo().data))
        #POSGrams
        tags = [
            ' '.join([
                pair[1] for pair in nltk.pos_tag(
                    word_tokenize(' '.join(id2text[txt[0]])))
            ]).lower()
        ]
        ngram = pos_unigram_vectorizer.transform(tags)
        temp.append(sum(ngram.tocoo().data))
        ngram = pos_bigram_vectorizer.transform(tags)
        temp.append(sum(ngram.tocoo().data))
        ngram = pos_trigram_vectorizer.transform(tags)
        temp.append(sum(ngram.tocoo().data))
        #Surface
        tokens = id2text[txt[0]][0].split()
        temp.append(len(tokens))
        #template
        # template patterns: spaces become \s, '*' becomes a word wildcard
        ids = 0
        num_t = 0
        wc_pos = 0
        maximum = 0
        for jdx, t in enumerate(templates):
            no = t.replace(' ', '\\s')
            no = no.replace('*', '[\w|\']+')
            regexp = re.compile(no)
            if regexp.search(item['tipcands']):
                ids = ids + (jdx + 1)
                num_t = num_t + 1
                wc_pos = wc_pos + (t.split(' ').index('*') + 1)
                if len(t.split(' ')) > maximum:
                    maximum = len(t.split(' '))
        temp.append(ids)
        temp.append(num_t)
        temp.append(wc_pos)
        temp.append(maximum)
        temp = temp + list(sise_fs[txt[0]])
        temp.append(int(one_hot_paris_cls_id[txt[0]]))
        temp.append(int(normal_paris_cls_id[txt[0]]))
        #W2V
        # averaged and IDF-weighted-average word vectors over the tokens
        cnt = 0
        cnt1 = 0
        uniw2v = np.zeros(img_cols)
        idfw2v = np.zeros(img_cols)
        for token in tokens:
            if token in embeddings_index:
                uniw2v = np.add(uniw2v, embeddings_index[token])
                if token in vocabulary:
                    idfw2v = np.add(
                        idfw2v, idf[vocabulary[token]] * embeddings_index[token])
                    cnt1 = cnt1 + idf[vocabulary[token]]
                cnt = cnt + 1
        if cnt != 0:
            uniw2v = 1 / cnt * uniw2v
            idfw2v = 1 / cnt1 * idfw2v
        temp = temp + list(uniw2v) + list(idfw2v) + list(cnn_fmap[txt[0]])
        matrix.append(temp)
    print(np.array(matrix).shape)
    features = {}
    for f in matrix:
        ids = str(int(f[0]))
        docs = id2text[ids]
        features[ids] = [f[1:], docs]
    cv = 10
    def build_data_cv(cv=cv):
        """ Loads data and split into 10 folds. """
        with open(
                home + 'datasets/cv-{}-{}-dataset.pickle'.format(level, process_mode),
                'rb') as f:
            revs = pickle.load(f)
        temp = []
        for item in revs:
            ids = item['id']
            fset = features[ids][0]
            txt = features[ids][1]
            datum = {
                "y": item['y'],
                # "text": txt,
                "id": ids,
                "fset": fset,
                "split": item['split']
            }
            temp.append(datum)
        return temp
    entirefset = build_data_cv()
    with open(
            home + 'datasets/feature_set/{}-{}-trip-entirefset.pickle'.format(
                level, process_mode), 'wb') as f:
        pickle.dump(entirefset, f)
def main(argv):
    """Audit phenotype links for every raw file in the `raws` table.

    Duplicate of the earlier variant in this file (the file appears to be a
    concatenation of scripts).  Re-parses each raw file's delimited payload,
    looks up the phenotype per line, and prints DELETE statements for
    phenotype_{plants,samples,aliquots} links that contradict what the
    Oracle side (ora_sql) reports.  Output is a SQL script for review;
    nothing is deleted directly.  Python 2 (print statements).
    """
    # get all files
    files = sql.fetch_all(None, None, "SELECT * FROM `raws`")
    for f in files:
        print("Processing %s" % f['filename'])
        raw_id = f['id']
        # preprocess
        lines = []
        try:
            line_nr = 0
            for line in f['data'].split("\r\n"):
                if len(line) == 0:
                    continue
                line_nr += 1
                line = line.rstrip('\r\n')
                line = re.split(r'\t|;', line)
                line = preprocess_line(line)
                lines.append(line)
        except:
            # bare except on purpose: show the offending line, then re-raise
            print "%d: %s" % (line_nr, line)
            raise
        # fix!
        line_nr = 0
        try:
            for line in lines:
                line_nr += 1
                phenotype = format_line(line)
                # create a readable program
                # look up the phenotype id based on the line number
                try:
                    # NOTE(review): 808/178 magic entity/value pair to skip -- confirm meaning
                    if (phenotype['entity_id'] == 808 and phenotype['value_id'] == 178):
                        continue
                    phenotype_id = sql.fetch_all('phenotype_raws', {
                        'line_nr': line_nr,
                        'raw_id': raw_id
                    })[0]['phenotype_id']
                except:
                    print "%d: %d" % (line_nr, raw_id)
                    raise
                # get the phenotype_%s % (plant, sample, aliquot) id, if any
                ph_plant = sql.fetch('phenotype_plants', phenotype_id, 'phenotype_id')
                ph_sample = sql.fetch('phenotype_samples', phenotype_id, 'phenotype_id')
                ph_aliquot = sql.fetch('phenotype_aliquots', phenotype_id, 'phenotype_id')
                # check where the link should belong and remove the others, if any
                if ora_sql.is_plant(phenotype['sample_id']) or ora_sql.was_plant(phenotype['sample_id']):
                    if ph_sample != False:
                        print "DELETE FROM `phenotype_samples` WHERE id = %s;" % ph_sample['id']
                    if ph_aliquot != False:
                        print "DELETE FROM `phenotype_aliquots` WHERE id = %s;" % ph_aliquot['id']
                elif ora_sql.is_sample(phenotype['sample_id']):
                    if ph_aliquot != False:
                        print "DELETE FROM `phenotype_aliquots` WHERE id = %s;" % ph_aliquot['id']
                    if ph_plant != False:
                        print "DELETE FROM `phenotype_plants` WHERE id = %s;" % ph_plant['id']
                elif ora_sql.is_aliquot(phenotype['sample_id']):
                    if ph_sample != False:
                        print "DELETE FROM `phenotype_samples` WHERE id = %s;" % ph_sample['id']
                    if ph_plant != False:
                        print "DELETE FROM `phenotype_plants` WHERE id = %s;" % ph_plant['id']
                else:
                    print "%s NOT found!!" % phenotype['sample_id']
        except:
            progress("%d: %s" % (line_nr, line))
            raise