from pagedb import PageDB
# isNone and loadPublicSuffixes are assumed to be defined elsewhere in this
# repository (isNone maps None to ""; loadPublicSuffixes parses the PSL file).

publicSuffixFile = 'publicsuffix.txt'
scheme = "dbname=ts_analysis"
fileName = "test.txt"
limit = 10000
seed = 1234
# resultList = ["invalid URL", "crawler failure", "hostname not found",
#               "authentication required (401)", "proxy error (502/504/52x)",
#               "timeout", "ok (redirected)", "bad request (400)", "ok",
#               "service unavailable (503)", "page not found (404/410)",
#               "redirection loop", "server error (500)",
#               "network or protocol error", "forbidden (403)",
#               "other HTTP response"]

pages = {}
publicSuffixes = loadPublicSuffixes(publicSuffixFile)
counter = 0
db = PageDB(scheme)
cursor = db.db.cursor()
for page in db.get_random_pages(limit, seed, ordered=True, want_links=False):
    counter += 1
    print(counter)
    originalURL = page.url
    locale = page.locale
    url_id = page.page_id[1]
    result = isNone(page.result)
    # result = resultList.index(result)
    detail = isNone(page.detail).lower()
    html = page.html_content
    userContent = page.text_content
    dom_stats = page.dom_stats
    depth = len(dom_stats.tags_at_depth)
    NumberOfTagTypes = len(dom_stats.tags)
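# --- Hedged sketch: the scripts in this repo call loadPublicSuffixes()
# --- without defining it. Below is a minimal assumed implementation that
# --- reads a file in the Mozilla Public Suffix List format into a set; the
# --- name and argument come from the call sites, the body is an assumption.
def loadPublicSuffixes(path):
    suffixes = set()
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            line = line.strip()
            # Blank lines and "//" comments carry no rules in the PSL format.
            if not line or line.startswith("//"):
                continue
            suffixes.add(line)
    return suffixes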
from pagedb import PageDB
# loadPublicSuffixes is assumed to be defined elsewhere in this repository.

def isNone(variable):
    # Normalize None to the empty string so callers can chain string methods.
    if variable is None:
        variable = ""
    return variable

publicSuffixFile = "publicsuffix.txt"
scheme = "dbname=ts_analysis"
fileName = "test.txt"
limit = 10000
seed = 1234
pages = {}
publicSuffixes = loadPublicSuffixes(publicSuffixFile)
db = PageDB(scheme)
for page in db.get_random_pages(limit, seed, ordered=True, want_links=False):
    originalURL = page.url.lower()
    redirURL = isNone(page.redir_url).lower()
    locale = page.locale
    url_id = page.page_id[1]
    result = isNone(page.result).lower()
    detail = isNone(page.detail)
    html = page.html_content
    userContent = page.text_content
    dom_stats = page.dom_stats
    depth = len(dom_stats.tags_at_depth)
    NumberOfTagTypes = len(dom_stats.tags)
    numberOfTags = 0
    for tag in dom_stats.tags:
        numberOfTags += dom_stats.tags[tag]
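# --- Usage note (illustrative, not part of the pipeline): isNone's contract
# --- is what lets the loop above chain .lower() without guarding for None.
assert isNone(None) == ""
assert isNone("Service Unavailable") == "Service Unavailable"
assert isNone(None).lower() == ""  # never raises AttributeError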
from pagedb import PageDB
import time
import json
import os

tfFileName = 'tfidf/tf.json'
tfGlobalFileName = 'tfidf/tfGlobal.json'
idfRowFileName = 'tfidf/idfRow.json'
idfColumnFileName = 'tfidf/idfColumn.json'
idfFileName = 'tfidf/idf.json'

start_time = time.time()
counter = 0
scheme = "dbname=ts_analysis"
db = PageDB(scheme)
limit = 100000
seed = 1234
# TODO: need tf global row and tf global column as well.
# tfGlobal: global term frequencies, for selecting overall features
tfGlobal = {}   # entire country vs. page matrix
idfGlobal = {}  # same page across different countries
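# --- Hedged sketch (toy data, illustrative names): the tf/idf quantities this
# --- script accumulates, computed over a two-document corpus and dumped with
# --- json the same way the tfidf/*.json files above are presumably written.
import json
import math
import os

docs = {
    "us/page1": ["news", "sports", "news"],
    "in/page1": ["news", "cricket"],
}

# Term frequency: raw counts per document.
tf_toy = {doc: {} for doc in docs}
for doc, words in docs.items():
    for word in words:
        tf_toy[doc][word] = tf_toy[doc].get(word, 0) + 1

# Inverse document frequency over the whole corpus.
df = {}
for words in docs.values():
    for word in set(words):
        df[word] = df.get(word, 0) + 1
idf_toy = {word: math.log(len(docs) / count) for word, count in df.items()}

os.makedirs('tfidf', exist_ok=True)
with open('tfidf/tf_toy.json', 'w') as fp:
    json.dump(tf_toy, fp)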
def getDomainRedir(originalURL, redirURL):
    # (head of the function not shown in this excerpt: originalDomain and
    #  redirDomain are assumed to hold hostnames derived from the two URLs)
    redirDomain = getRegisteredDomain(redirDomain + '.', publicSuffixes)
    if redirDomain == -1:
        redirDomain = redirURL
    isRedir = (redirDomain != originalDomain)
    return originalDomain, redirDomain, isRedir

publicSuffixFile = 'publicsuffix.txt'
scheme = "dbname=ts_analysis"
fileName = "test.txt"
limit = 1000  # get_pages presumably expects an integer limit, not a string
pages = {}
publicSuffixes = loadPublicSuffixes(publicSuffixFile)
db = PageDB(scheme)
for page in db.get_pages(where_clause="", limit=limit, ordered=False):
    originalURL = page.url
    locale = page.locale
    url_id = page.page_id[1]
    result = page.result
    detail = page.detail
    html = page.html_content
    userContent = page.text_content
    dom_stats = page.dom_stats
    depth = len(dom_stats.tags_at_depth)
    # userContentFeatures, domFeatures = getHtmlFeatures(html)
    redirURL = page.redir_url
    originalDomain, redirDomain, isRedir = getDomainRedir(originalURL, redirURL)
print(len(pages))
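# --- Hedged sketch: getRegisteredDomain is not defined in these scripts. An
# --- assumed implementation matching the call sites (returns the registered
# --- domain, i.e. public suffix plus one label, or -1 when nothing matches).
# --- A real PSL lookup also handles wildcard ("*.ck") and exception ("!...")
# --- rules, which this sketch ignores.
def getRegisteredDomain(hostname, publicSuffixes):
    labels = hostname.rstrip('.').split('.')
    # Try the longest candidate suffix first.
    for i in range(len(labels)):
        suffix = '.'.join(labels[i:])
        if suffix in publicSuffixes:
            if i == 0:
                return -1  # the hostname itself is a public suffix
            return '.'.join(labels[i - 1:])
    return -1

# Example: with {"co.uk"} loaded, "www.example.co.uk." -> "example.co.uk".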