# Iterate a random sample of captured pages and pull out per-page fields.
# loadPublicSuffixes() and isNone() are helper functions from the
# surrounding module (isNone is defined below).
from pagedb import PageDB

publicSuffixFile = 'publicsuffix.txt'
scheme = "dbname=ts_analysis"
fileName = "test.txt"
limit = 10000
seed = 1234

# resultList =  ["invalid URL", "crawler failure", "hostname not found", "authentication required (401)", "proxy error (502/504/52x)",
# "timeout", "ok (redirected)", "bad request (400)", "ok", "service unavailable (503)", "page not found (404/410)",
# "redirection loop", "server error (500)", "network or protocol error", "forbidden (403)", "other HTTP response"]

pages = {}
publicSuffixes = loadPublicSuffixes(publicSuffixFile)

counter = 0
db = PageDB(scheme)
cursor = db.db.cursor()
for page in db.get_random_pages(limit, seed, ordered=True, want_links=False):
    counter += 1
    print(counter)
    originalURL = page.url
    locale = page.locale
    url_id = page.page_id[1]
    result = isNone(page.result)
    # result = resultList.index(result)
    detail = isNone(page.detail).lower()
    html = page.html_content
    userContent = page.text_content
    dom_stats = page.dom_stats
    depth = len(dom_stats.tags_at_depth)    # number of DOM nesting levels
    numberOfTagTypes = len(dom_stats.tags)  # number of distinct tag names

from pagedb import PageDB


def isNone(variable):
    """Return variable unchanged, or "" when it is None."""
    if variable is None:
        variable = ""
    return variable
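# A small illustration of the helper (values are hypothetical): it keeps the
# .lower()/len() style calls in the loops below from failing when a database
# field comes back as NULL/None.
assert isNone(None) == ""
assert isNone("ok (redirected)") == "ok (redirected)"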


publicSuffixFile = "publicsuffix.txt"
scheme = "dbname=ts_analysis"
fileName = "test.txt"
limit = 10000
seed = 1234

pages = {}
publicSuffixes = loadPublicSuffixes(publicSuffixFile)

db = PageDB(scheme)
for page in db.get_random_pages(limit, seed, ordered=True, want_links=False):
    originalURL = page.url.lower()
    redirURL = isNone(page.redir_url).lower()
    locale = page.locale
    url_id = page.page_id[1]
    result = isNone(page.result).lower()
    detail = isNone(page.detail)
    html = page.html_content
    userContent = page.text_content
    dom_stats = page.dom_stats
    depth = len(dom_stats.tags_at_depth)    # number of DOM nesting levels
    numberOfTagTypes = len(dom_stats.tags)  # number of distinct tag names
    # total number of tags on the page
    numberOfTags = 0
    for tag in dom_stats.tags:
        numberOfTags += dom_stats.tags[tag]
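    # A minimal sketch of keeping the values just collected for later use;
    # the (locale, url_id) key and the field names are illustrative
    # assumptions, not the original script's layout.
    pages[(locale, url_id)] = {
        'url': originalURL,
        'redirURL': redirURL,
        'result': result,
        'detail': detail,
        'depth': depth,
        'numberOfTagTypes': numberOfTagTypes,
        'numberOfTags': numberOfTags,
    }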
Example #3
from pagedb import PageDB
import time
import json
import os


tfFileName = 'tfidf/tf.json'
tfGlobalFileName = 'tfidf/tfGlobal.json'
idfRowFileName = 'tfidf/idfRow.json'
idfColumnFileName = 'tfidf/idfColumn.json'
idfFileName = 'tfidf/idf.json'
     
start_time = time.time()
counter = 0
scheme = "dbname=ts_analysis"
db = PageDB(scheme)
limit = 100000
seed = 1234

document = []
wordtfidf = {}
# TODO: also need tf global row and tf global column.
# tf global, for selecting overall features
tfGlobal = {}
# entire country vs. page matrix
idfGlobal = {}
# same page, different countries
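# A minimal sketch of how the structures above might be filled and written
# to the files configured earlier; the tokenisation, the (locale, url_id)
# key, and the idf weighting are assumptions, not the original computation.
import math
from collections import Counter

documentFrequency = Counter()
for page in db.get_pages(where_clause="", limit=limit, ordered=False):
    counter += 1
    tokens = (page.text_content or "").lower().split()
    tf = Counter(tokens)                    # raw term counts for this page
    wordtfidf[(page.locale, page.page_id[1])] = dict(tf)
    for term, count in tf.items():
        tfGlobal[term] = tfGlobal.get(term, 0) + count
    documentFrequency.update(tf.keys())     # one hit per page per term

# idf(t) = log(N / df(t)) over all pages seen
idfGlobal = {term: math.log(counter / df) for term, df in documentFrequency.items()}

with open(tfGlobalFileName, 'w') as fp:
    json.dump(tfGlobal, fp)
with open(idfFileName, 'w') as fp:
    json.dump(idfGlobal, fp)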
# Tail of the getDomainRedir(originalURL, redirURL) helper used below.
		redirDomain = getRegisteredDomain(redirDomain + '.', publicSuffixes)
		if(redirDomain == -1):
			redirDomain = redirURL
		isRedir = (redirDomain != originalDomain)

	return originalDomain, redirDomain, isRedir

publicSuffixFile = 'publicsuffix.txt'
scheme = "dbname=ts_analysis"
fileName = "test.txt"
limit = 1000

pages = {}
publicSuffixes = loadPublicSuffixes(publicSuffixFile)

db = PageDB(scheme)
for page in db.get_pages(where_clause = "", limit = limit, ordered = False):
	originalURL = page.url
	locale = page.locale
	url_id = page.page_id[1]
	result = page.result
	detail = page.detail
	html = page.html_content
	userContent = page.text_content
	dom_stats = page.dom_stats
	depth = len(dom_stats.tags_at_depth)
	# userContentFeatures, domFeatures = getHtmlFeatures(html)
	redirURL = page.redir_url
	originalDomain, redirDomain, isRedir = getDomainRedir(originalURL, redirURL)

	print(len(pages))
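# The examples above call loadPublicSuffixes() and getRegisteredDomain()
# without showing them. This is a minimal sketch of plausible
# implementations, assuming publicsuffix.txt is a copy of the Public Suffix
# List; it ignores the list's wildcard ("*.") and exception ("!") rules, so
# it only approximates the real lookup.
def loadPublicSuffixes(path):
	# one suffix per line; '//' lines are comments
	suffixes = set()
	with open(path, encoding='utf-8') as fp:
		for line in fp:
			line = line.strip()
			if line and not line.startswith('//'):
				suffixes.add(line)
	return suffixes

def getRegisteredDomain(host, publicSuffixes):
	# Return the registrable domain (public suffix plus one label),
	# or -1 when no known suffix matches, mirroring the callers above.
	labels = host.rstrip('.').split('.')
	for i in range(len(labels) - 1):
		if '.'.join(labels[i + 1:]) in publicSuffixes:
			return '.'.join(labels[i:])
	return -1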