def test_eval(dir):
    separator = "*" * 60
    print("\n" + separator)
    print("Testing test case " + dir)
    data['corrected_table_name'] = dir + "_corrected"
    data['distribute_commitments_table'] = dir + "_distribute_commitment"
    data['billing_export_table_name'] = data[
        'billing_export_dataset_id'] + '.' + dir + "_export"
    data['load_billing_export_table_name'] = dir + "_export"
    data['commitment_table_name'] = dir + "_commitment"
    data['temp_commitments_table_name'] = dir + "_commitment"
    data['project_label_credit_breakout_table'] = dir + "_project_label_credit"
    # Test cases named "*_b" exercise attribution option 'b'; all others use 'a'.
    if dir.endswith('_b'):
        data['cud_cost_attribution_option'] = 'b'
    else:
        data['cud_cost_attribution_option'] = 'a'
    prepare_consolidated_billing(dir, data)
    dump_result(data['project_id'], data['corrected_dataset_id'],
                data['corrected_table_name'], "tests/" + dir)
    retVal = filecmp.cmp("tests/" + dir + "/output_cmp.json",
                         "tests/" + dir + "/expected_output.json",
                         shallow=False)
    try:
        assert retVal
        clean(dir, data)
        print("\n" + 'Test case ' + dir + ' ... PASSED')
    except AssertionError:
        print("\n" + 'Test case ' + dir + ' ... FAILED')
def getShare(self, shareID, r=None, asAdmin=False):
    sharePath = h.makePath(self.sharesPath, shareID)
    if not os.path.exists(sharePath):
        return None, None
    lh, lf = None, h.makePath(h.LOCKS_FOLDER,
                              "_sfl_share%s" % h.clean(shareID))
    try:
        lh = h.getLockShared(lf, 5)
        shareJson = h.loadJsonFile(sharePath)
        files = shareJson.get("files", None)
        if files is None:
            # Older shares stored a single path under "file"; normalize to a list.
            files = shareJson.get("file", None)
            if files is not None:
                files = [files]
        s = share(shareJson["ID"], shareJson["creation"], files,
                  shareJson.get("views", []), shareJson.get("password"),
                  shareJson.get("duration", 0))
        if not asAdmin and s.duration > 0 and s.duration + s.creation < h.now():
            rs, rh = None, "Share has expired"
        else:
            rs, rh = s, "ok"
        # Release the lock early; the remaining work does not touch the file.
        h.releaseLock(lh)
        lh = None
        if rs is None:
            return rs, rh
        if r is not None:
            rs.tag = h.getURLParams(r.url).get("t", None)
        return rs, rh
    except:
        le, lt = h.getLastExceptionAndTrace()
        return None, le
    finally:
        if lh is not None:
            h.releaseLock(lh)
def getPasswords(self, path):
    path = h.cleanPath(path)
    passwordsFile = h.makePath(self.basePath, path, ".password")
    if not os.path.exists(passwordsFile):
        return set()
    passwordsCacheKey = h.makeKeyFromArguments(path)
    lh, lf = None, h.makePath(h.LOCKS_FOLDER,
                              "_sfl_password_%s" % h.clean(path))
    try:
        self.passwordsLock.acquire()
        # Serve from the in-memory cache while the file is unchanged on disk.
        if passwordsCacheKey in self.passwordsCache:
            pc = self.passwordsCache[passwordsCacheKey]
            if h.getFileModified(passwordsFile) == pc["date"]:
                return pc["passwords"]
        lh = h.getLockShared(lf, 5)
        passwords = set([
            p for p in h.readFromFile(passwordsFile).split("\n") if p != ""
        ])
        self.passwordsCache[passwordsCacheKey] = {
            "passwords": passwords,
            "date": h.getFileModified(passwordsFile)
        }
        return passwords
    finally:
        self.passwordsLock.release()
        if lh is not None:
            h.releaseLock(lh)
def setUserPassword(self, path, password, r, response=None):
    path = h.cleanPath(path)
    cookieKey = "_sf_pass_%s" % h.clean(path if path != "" else "-")
    r.cookies = dict(r.cookies)
    r.cookies[cookieKey] = password
    if response is not None:
        response.set_cookie(cookieKey, password, max_age=COOKIE_DURATION)
def addShare(self, shareID, paths, duration, password):
    paths = [path.strip("/") for path in paths]
    sharePath = h.makePath(self.sharesPath, shareID)
    password = "" if password is None else password
    lh, lf = None, h.makePath(h.LOCKS_FOLDER,
                              "_sfl_share%s" % h.clean(shareID))
    try:
        lh = h.getLockExclusive(lf, 5)
        s = share(shareID, h.now(), paths, [], password, duration)
        h.writeJsonFile(
            sharePath, {
                "ID": s.ID,
                "files": s.files,
                "creation": s.creation,
                "views": s.views,
                "duration": s.duration,
                "password": s.password
            })
        if self.user is not None:
            h.changeFileOwner(sharePath, self.user)
        return s, "ok"
    except:
        le, lt = h.getLastExceptionAndTrace()
        return None, le
    finally:
        if lh is not None:
            h.releaseLock(lh)
def getDefaultAxis(**kwargs):
    global default_
    axisDefault = helper.keyvalToDict(default_['axis'])
    # Also merge the user-specified attributes.
    axisDefault = helper.keyvalToDict(kwargs.get('axis_attrib', ''), axisDefault)
    # Overwrite any default with the matching keyword argument.
    for k in axisDefault:
        if kwargs.get(k):
            axisDefault[k] = '%s' % helper.clean(kwargs[k])
            if k in ['title', 'xlabel', 'ylabel']:
                axisDefault[k] = '{ %s }' % helper.clean(kwargs[k])
    axis = ET.Element('axis', **axisDefault)
    return axis
def test_cleanRmFile(self):
    '''
    Tests if the file is removed.
    '''
    filename = 'clean_test_file'
    with open(filename, 'w+') as file:
        file.write("Testing clean method.")
    remove = helper.clean(filename)
    actual = os.path.isfile(filename)
    expected = False
    self.assertEqual(actual, expected)
def getContracts(cur, addresses):
    transferred = 0
    counter = 0
    for addr in addresses:
        # Skip addresses that are already in the database.
        if queries.selectAddrContracts(cur, addr) == 0:
            # Query etherscan.io's API for the contract source at this address.
            page = helper.getPage(
                'https://api.etherscan.io/api?module=contract&action=getsourcecode&address={}&apikey=RMo8wU2K53Mm'
                .format(addr))
            # Extract the contract from the response.
            res = page.json()['result'][0]
            if not isinstance(res, str):
                source = res['SourceCode']
                # Only proceed if a contract actually exists at the address.
                if len(source) > 0:
                    # Remove the comments.
                    contract = helper.removeComments(source)
                    # Record the code size.
                    codesize = len(contract)
                    # Create the AST via the external script.
                    helper.saveToFile(addr, contract)
                    ast = helper.createAST(addr)
                    # Add it to the database.
                    if queries.insertToContracts(cur, addr, codesize,
                                                 contract, ast, transferred):
                        counter += 1
                    helper.clean(addr)
    return counter
def removeShare(self, shareID):
    sharePath = h.makePath(self.sharesPath, shareID)
    if not os.path.exists(sharePath):
        raise Exception("Unknown share", shareID)
    lh, lf = None, h.makePath(h.LOCKS_FOLDER,
                              "_sfl_share%s" % h.clean(shareID))
    try:
        lh = h.getLockExclusive(lf, 5)
        os.remove(sharePath)
        return True
    finally:
        if lh is not None:
            h.releaseLock(lh)
def _routeShareAdd(self, paths):
    if not self.ap.isAdmin(request):
        return self._redirect("/admin")
    if self.ap.shareForbidden(paths):
        return self._makeTemplate("forbidden", path=paths)
    alerts = []
    paths = h.decode(paths)
    paths = paths.split("@@@")
    shareID = request.form.get("shareID", "")
    defaultShareID = request.form.get("defaultShareID", h.uniqueIDSmall())
    duration = request.form.get("duration", "")
    durationInSecs = h.parseInt(duration, 0) * 24 * 60 * 60
    password = request.form.get("password-share-add", "")
    shareSubmit = request.form.get("create-share-submit", False)
    shareForceSubmit = request.form.get("create-share-force-submit", False)
    needForce = False
    containers = [p for p in paths if self.ip.isItemContainer(p)]
    if shareSubmit or shareForceSubmit:
        if shareID == "":
            shareID = defaultShareID
        shareID = h.clean(shareID)
        if shareID == "":
            alerts.append(
                ["Can't create Share", "Share ID provided is invalid."])
        else:
            if not self.sp.shareExists(shareID) or shareForceSubmit:
                share, hint = self.sp.addShare(shareID, paths, durationInSecs,
                                               password)
                if share is not None:
                    # alerts.append(["Share created", "The Share %s has been created for %s" % (shareID, path)])
                    return self._routeShares(alerts, shareAdded=share)
                else:
                    alerts.append([
                        "Can't create Share",
                        "Share %s could not be created. Hint: %s" %
                        (shareID, hint)
                    ])
            else:
                alerts.append([
                    "Can't create Share",
                    "The Share ID %s is already used for %s." %
                    (shareID, ", ".join(paths))
                ])
                needForce = True
    return self._makeTemplate("share-add",
                              paths=paths,
                              defaultShareID=defaultShareID,
                              shareID=shareID,
                              duration=duration,
                              alerts=alerts,
                              needForce=needForce,
                              containers=containers)
def getLinks(pageUrl):
    global pages
    cleanedUrl = helper.clean(pageUrl)
    domain = helper.get_domain(cleanedUrl)
    html = urlopen(cleanedUrl)
    bsObj = BeautifulSoup(html, "lxml")
    for link in bsObj.findAll("a"):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # We have encountered a new page
                newPage = link.attrs['href']
                if helper.valid(newPage, domain):
                    pages.add(newPage)
                    print("%sLink Found >> %s" % (W, G), newPage)
                    getLinks(newPage)
def crawLink(self, link, mode, outputFile, gd):
    target_link = clean(link)
    if mode == "accident":
        link_mode = "search_result"
    elif mode == "ordinary":
        link_mode = "same_side"
    print("Craw link for: " + target_link)
    gd.write("**** Craw link for: " + target_link + "\n")
    links = self.parser.run(target_link, "link", link_mode)
    #print("craw_link " + ''.join(links))
    if links is not None:
        for l in links:
            if l is not None:
                gd.write(" + " + l + "\n")
def info():
    if request.method == 'POST':
        hadith = request.form['hadith']
        hadith = hp.clean(hadith).strip()
        hadith_narrators = hp.scan_narrator(narrators, hadith)
        vectors = np.array([hp.tfbinary(narrators, hadith_narrators)])
        vectors = np.array(pca.transform(vectors))
        output = nn.predict(vectors)[0]
        return json.dumps({
            'result': output,
            'narrators': hadith_narrators.tolist(),
            'cleaned_hadith': hadith
        })
    return json.dumps(None)
def handle_starttag(self, tag, attrs):
    '''
    Override of the default handler for <a> and ??? tags
    TODO: update this comment when asset handling is done
    '''
    for key, val in attrs:
        if key == "href":
            if contain_static(val):
                # handle static files
                print("-", val)  # show the static file
            elif tag == "a":
                # handle links
                url = urljoin(self.url, val)  # append relative path to the root path
                url = clean(url)  # clean up url
                if valid(url, self.domain):
                    self.urls.append(url)  # append url to the return list
            else:
                pass
def crawl(self, target_url):
    target_url = clean(target_url)  # clean target_url
    self.to_visit.append(target_url)  # put target_url on the to_visit list
    while len(self.to_visit) > 0:
        url = self.to_visit.pop(0)  # get the next url
        print("The spider is visiting:", url)
        urls = self.parser.run(url)  # parse the url
        self.visted.add(url)  # add this visited url to the visited set
        # Add urls from the parser to the to_visit list
        # when they are not visited and not already in the to_visit list.
        for url in urls:
            if url not in self.visted and url not in self.to_visit:
                self.to_visit.append(url)
    print("The spider has finished crawling the web at {url}".format(
        url=target_url))
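# Note on the crawl loop above: to_visit.pop(0) is O(n) on a Python list, so a
# large frontier degrades. collections.deque keeps the same FIFO
# (breadth-first) order with O(1) popleft. A sketch of the substitution,
# assuming to_visit is created in the spider's constructor (not shown above):
#
#   from collections import deque
#   self.to_visit = deque()
#   ...
#   url = self.to_visit.popleft()  # instead of self.to_visit.pop(0)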
def addNewPassword(self, path, password):
    path = h.cleanPath(path)
    if password is None:
        return False
    if self.passwordEditForbidden(path):
        return False
    lh, lf = None, h.makePath(h.LOCKS_FOLDER,
                              "_sfl_password_%s" % h.clean(path))
    try:
        passwordFile = h.makePath(self.basePath, path, ".password")
        requiredPasswords = list(self.getPasswords(path))
        requiredPasswords.append(password)
        lh = h.getLockExclusive(lf, 5)
        h.writeToFile(passwordFile, "\n".join(list(set(requiredPasswords))))
        return True
    except:
        print(h.getLastExceptionAndTrace())
        return False
    finally:
        if lh is not None:
            h.releaseLock(lh)
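# The acquire-in-try / release-in-finally pattern above recurs in addShare,
# saveShare, removeShare, and addNewPassword. A context manager could fold it
# into one place. This is a sketch only: it assumes h.getLockExclusive and
# h.releaseLock behave as they are used above, and "exclusiveLock" is a
# hypothetical name, not part of the existing helpers.
from contextlib import contextmanager

@contextmanager
def exclusiveLock(lockFile, timeout=5):
    lh = h.getLockExclusive(lockFile, timeout)
    try:
        yield lh
    finally:
        h.releaseLock(lh)

# Usage, mirroring the write in addNewPassword:
#   with exclusiveLock(lf):
#       h.writeToFile(passwordFile, "\n".join(list(set(requiredPasswords))))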
def handle_starttag(self, tag, attrs):
    if self.whatcrawling_mode == "link":  # only run when mode: crawling link
        if self.num_link < NUM_LINK_EACH_PAGE:
            # For crawling links, handle <a> and ??? tags
            for key, val in attrs:
                if key == "href":
                    if containStatic(val):
                        pass
                    else:
                        # handle links
                        link = urljoin(self.link, val)  # append relative path to the root path
                        link = clean(link)  # clean up link
                        if (self.link_mode == "search_result"
                                and not checkIsExceptDomain(link)) or (
                                    self.link_mode == "same_side"
                                    and sameDomain(link, self.domain)):
                            if link.lower() not in self.arr_crawled:
                                self.arr_links.append(link)  # append link to the return list
                                self.num_link += 1
                                self.arr_crawled.append(link.lower())
def saveShare(self, s: share):
    lh, lf = None, h.makePath(h.LOCKS_FOLDER,
                              "_sfl_share%s" % h.clean(s.ID))
    try:
        lh = h.getLockExclusive(lf, 5)
        sharePath = h.makePath(self.sharesPath, s.ID)
        h.writeJsonFile(
            sharePath, {
                "ID": s.ID,
                "files": s.files,
                "creation": s.creation,
                "views": s.views,
                "duration": s.duration,
                "password": s.password
            })
        if self.user is not None:
            h.changeFileOwner(sharePath, self.user)
        return True
    except:
        le, lt = h.getLastExceptionAndTrace()
        return False
    finally:
        if lh is not None:
            h.releaseLock(lh)
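# addShare and saveShare write the same six-field share dict. A small
# serializer would keep the on-disk schema in one place. Sketch only:
# "shareToJson" is a hypothetical helper, not part of the existing code.
def shareToJson(s):
    # Field names match what getShare reads back (with "file" as a legacy
    # single-path fallback on load).
    return {
        "ID": s.ID,
        "files": s.files,
        "creation": s.creation,
        "views": s.views,
        "duration": s.duration,
        "password": s.password
    }

# Both writers could then call: h.writeJsonFile(sharePath, shareToJson(s))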
def test_cleanDiffTypes(self):
    '''
    Tests that clean handles being given a directory.
    '''
    actual_dir = helper.clean('test_dir')
    expected = False
    self.assertEqual(actual_dir, expected)
def test_cleanFileNonexisting(self):
    '''
    Tests that clean does not crash when the file does not exist.
    '''
    actual = helper.clean('clean_test_nofile')
    expected = False
    self.assertEqual(actual, expected)
def tearDownClass(cls):
    clean()
reviewcount = 0
successcount = 0
#iterable = readable_data.find_all('review')
iterable = readable_data.findall('review')
for review in iterable:
    reviewcount += 1
    # write review to file
    try:
        review_text = review.find('review_text').text
        # remove newlines, quotation marks, unicode
        review_text = helper.clean(review_text)
        # tokenize into a list of sentences
        review_text = review_tokenizer.cleanOnereview(review_text,
                                                      removesinglewords=False)
        # join with <eos> tags
        review_text = " <eos> ".join(review_text)
        # TODO: evaluate sentiment classification performance when using <eos> tag
        if include_title:
            review_title = review.find('title').text
            review_title = helper.clean(review_title)
            review_title = review_tokenizer.cleanOnereview(
                review_title, removesinglewords=False)
            review_title = " ".join(review_title)
def run_model(training_path, test_path, v_type, model_type, smooth_value):
    trace_file = open("trace_" + str(v_type) + "_" + str(model_type) + "_" +
                      str(smooth_value) + ".txt", "w", encoding="utf-8")
    text_body = parseFile(training_path)
    generate_v(text_body, v_type)
    tweets = helper.clean(v_type, test_path)
    total = 0
    correct = 0
    if model_type == 1:
        unigramMap = unigram.generate_unigram(v0, v_type, smooth_value)
        for tweet in tweets:
            result = unigram.make_guess(tweet, unigramMap)
            generate_trace_file.genetate_trace_file(trace_file, result)
            generate_trace_file.count_result(result)
            total += 1
            if result['isCorrect']:
                correct += 1
    elif model_type == 2:
        bigramMap = bigram.generate_bigram(v0, v_type, smooth_value)
        for tweet in tweets:
            result = bigram.make_guess(tweet, bigramMap)
            generate_trace_file.genetate_trace_file(trace_file, result)
            generate_trace_file.count_result(result)
            total += 1
            if result['isCorrect']:
                correct += 1
    elif model_type == 3:
        trigramMap = trigram.generate_trigram(v0, v_type, smooth_value)
        for tweet in tweets:
            result = trigram.make_guess(tweet, trigramMap)
            generate_trace_file.genetate_trace_file(trace_file, result)
            generate_trace_file.count_result(result)
            total += 1
            if result['isCorrect']:
                correct += 1
    elif model_type == 4:
        bigramMap = bigram.generate_bigram(v0, v_type, 0.00000000000001)
        trigramMap = trigram.generate_trigram(v0, v_type, 0.00000000000001)
        emojis_count = helper.generate_emoji_count(text_body)
        for tweet in tweets:
            result = byom.make_guess(tweet, bigramMap, trigramMap,
                                     emojis_count)
            generate_trace_file.genetate_trace_file(trace_file, result)
            generate_trace_file.count_result(result)
            total += 1
            if result['isCorrect']:
                correct += 1
    trace_file.close()
    accuracy = correct / total
    generate_trace_file.generate_eval(v_type, model_type, smooth_value,
                                      accuracy, total)
    print('total: ', total)
    print('accuracy: ', accuracy)
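# The four model branches in run_model repeat the same scoring loop and differ
# only in how they guess. Factoring the loop out would shrink the function.
# Sketch only: "score_tweets" is a hypothetical helper; it assumes the
# make_guess callables and the generate_trace_file module behave as used above.
def score_tweets(tweets, guess, trace_file):
    total, correct = 0, 0
    for tweet in tweets:
        result = guess(tweet)
        generate_trace_file.genetate_trace_file(trace_file, result)
        generate_trace_file.count_result(result)
        total += 1
        if result['isCorrect']:
            correct += 1
    return total, correct

# e.g. for the unigram branch:
#   unigramMap = unigram.generate_unigram(v0, v_type, smooth_value)
#   total, correct = score_tweets(
#       tweets, lambda t: unigram.make_guess(t, unigramMap), trace_file)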
def test_clean(self):
    assert clean("http://google.com/#123123") == "http://google.com"
    assert clean("gocardless.com/") == "http://gocardless.com"
    print("[Test] test_clean() pass")
def crawContent(self, link):
    target_link = clean(link)
    print("Craw content for: " + target_link)
    content = self.parser.run(target_link, "content", "")
    return content
def readData(user, helper, path_to_data):
    fbData = open(path_to_data + "/Facebook/U" + str(user) + ".txt").read()
    fbData = helper.easyClean(fbData)
    fbData = helper.stem(fbData)
    # print "Facebook data: "+str(len(fbData.split()))+" words"
    linkedInData = open(path_to_data + "/LinkedIn/U" + str(user) + ".html").read()
    linkedInExtractions = {}
    title_pattern = re.compile('<p.*class="title"*.>(.*?)<\/p>')
    industry_pattern = re.compile('<a.*?name="industry".*?>(.*?)<\/a>')
    summary_pattern = re.compile('<div class="summary"><p dir="ltr" class="description">([\S\s]*?)<\/div>')
    description_pattern = re.compile('dir="ltr" class="description">([\S\s]*?)<\/p>')
    interrests_pattern = re.compile('<li><a title="Find users with this keyword" href=".*?">([\S\s]*?)<\/a>')
    skills_pattern = re.compile('data-endorsed-item-name="(.*?)">')
    schools_pattern = re.compile('school-name">(.*?)<\/a>')
    majors_pattern = re.compile('<span class="major"><a .*?>(.*?)<\/a>')
    positions_pattern = re.compile('<a title="Learn more about this title" href=.*?>(.*?)<\/a>')
    job_descriptions_pattern = re.compile('<p dir="ltr" class="description summary-field-show-more">(.*?)<\/p>')
    # This one might not be needed
    others_also_viewed_people_in_pattern = re.compile('<p class="browse-map-title">(.*?)<\/p>')
    linkedInDescription = description_pattern.findall(linkedInData)
    linkedInTitle = title_pattern.findall(linkedInData)
    linkedInIndustry = industry_pattern.findall(linkedInData)
    # Fall back to empty strings when a pattern finds no match.
    if len(linkedInIndustry) == 0:
        linkedInExtractions["industry"] = ""
    else:
        linkedInExtractions["industry"] = linkedInIndustry[0]
    if len(linkedInTitle) == 0:
        linkedInExtractions["title"] = ""
    else:
        linkedInExtractions["title"] = linkedInTitle[0]
    if len(linkedInDescription) == 0:
        linkedInExtractions["description"] = ""
    else:
        linkedInExtractions["description"] = linkedInDescription[0]
    linkedInExtractions["interrests"] = interrests_pattern.findall(linkedInData)
    linkedInExtractions["skills"] = skills_pattern.findall(linkedInData)
    linkedInExtractions["schools"] = schools_pattern.findall(linkedInData)
    linkedInExtractions["majors"] = majors_pattern.findall(linkedInData)
    linkedInExtractions["positions"] = positions_pattern.findall(linkedInData)
    linkedInExtractions["jobDescriptions"] = job_descriptions_pattern.findall(linkedInData)
    # This one might not be needed
    linkedInExtractions["othersAlsoViewed"] = others_also_viewed_people_in_pattern.findall(linkedInData)
    # Clean, stem, and strip stopwords from every extracted string.
    for i in linkedInExtractions:
        if type(linkedInExtractions[i]) == str:
            linkedInExtractions[i] = helper.removeStopwords(
                helper.stem(helper.easyClean(linkedInExtractions[i])))
        elif type(linkedInExtractions[i]) == list:
            for j in range(0, len(linkedInExtractions[i])):
                linkedInExtractions[i][j] = helper.removeStopwords(
                    helper.stem(helper.easyClean(linkedInExtractions[i][j])))
    # print "LinkedIn data: "+str(len(linkedInExtractions))+" categories"
    path_to_tweets = path_to_data + "/Twitter/U" + str(user)
    twitterExtractions = {}
    tweetTexts = []
    # Read tweets, keeping only replies.
    for twitterJsonFile in os.listdir(path_to_tweets):
        if twitterJsonFile != ".DS_Store":
            try:
                tweet_data = json.load(open(path_to_tweets + "/" + twitterJsonFile))
                for i in range(0, len(tweet_data) - 1):  # note: skips the last entry
                    if tweet_data[i]["in_reply_to_status_id"] is not None:
                        tweetTexts.append(helper.clean(tweet_data[i]["text"]))
            except:
                pass  # print "No twitter data for user "+str(user)
    twitterExtractions["tweets"] = tweetTexts
    # print "Extracted tweets: "+str(len(twitterExtractions["tweets"]))
    return {"linkedInData": linkedInExtractions,
            "facebookData": fbData,
            "twitterData": twitterExtractions}
def crawTitle(self, link):
    target_link = clean(link)
    print("Craw title for: " + target_link)
    title = self.parser.run(target_link, "title", "")
    return title
def getUserPassword(self, path, r: request):
    path = h.cleanPath(path)
    path = h.clean(path if path != "" else "-")
    cookieKey = "_sf_pass_%s" % path
    if cookieKey in r.cookies:
        return r.cookies[cookieKey]
    else:
        return None
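# setUserPassword and getUserPassword form a round trip through the
# "_sf_pass_<path>" cookie: the setter stores the password both on the live
# request and (optionally) on the response, and the getter reads it back on
# later requests. A minimal sketch of the pairing, with hypothetical values:
#
#   self.setUserPassword("docs/private", "s3cret", request, response)
#   # ... on a subsequent request carrying the cookie:
#   pw = self.getUserPassword("docs/private", request)  # -> "s3cret"
#
# Both sides key the cookie on h.clean(path), falling back to "-" for the
# root path, so the lookup key is identical in setter and getter.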