Esempio n. 1
def read_ids(idfilename):
    with as idfile:
        for line in idfile:
            line = line.strip()

            regex = "(\d+)\,\s+(.*)"
            identry = re.findall(regex, line)

            identry = identry[0]
            if len(identry) < 2:
            myid = identry[0]
            myentry = identry[1]

            id_to_name[myid] = myentry
            name_to_id[myentry] = myid
    p("Read all ids", "info")
def read_ids(idfilename): 
    with as idfile: 
        for line in idfile:
            line = line.strip()

            regex = "(\d+)\,\s+(.*)"
            identry = re.findall(regex, line)
            identry = identry[0]
            if len(identry) < 2: 
            myid = identry[0]
            myentry = identry[1]

            id_to_name[myid] = myentry
            name_to_id[myentry] = myid
    p("Read all ids", "info")
    categoryinputfilename = sys.argv[1]
    hiddencategoryinputfilename = sys.argv[2]
    print "\n[RUN]: \n"\
    "python \n"\
    "\t [Subcat-links.txt.gz]\n"\
    "\t [All_hidden_categories.txt.gz]\n"

hidden_cat = dict()         #Dictionary to check all hidden category names
hiddencnt = 0               #Counter to see how many hidden categories are read
hiddencat_found = False     #Boolean to determine if hidden cateogry is found within graph

# Reads all hidden categories from the file containing the names of all hidden categories
p("Reading all hidden categories", "info")
with as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()
        if line in hidden_cat:
            hiddencnt += 1
p("All hidden categories read", "info")

# Reads the category graph
p("Read all category links", "info")
with, "rb") as inputfile:
    for line in inputfile:
        line = line.strip()
def find_grades(categoryinfofilename, categoriesoutputfilename): 
    global grades
    global C_in
    global C_out
    global avg_in
    global avg_out
    starttime = time.time()
    begintime = starttime
    p("Reading all category info", "info")
    parent = child = ""
    graph = dict()    # Dictionary to keep track of the children to each parent cat
    subgraph = dict() # Dictionary to keep track of the parents of each cat
    # Creating category graph
    with open(categoryinfofilename) as categorygraph:
        for line in categorygraph:
            line = line.strip()
            if line.startswith("*"): #children
                child = name_to_id[line[2:]]
                if parent == "":
                if parent in graph:
                    if child not in graph[parent]: 
                    graph[parent] = [child]

                if child in subgraph: 
                    if parent in subgraph[child]: 
                        a = 0
                    subgraph[child] = [parent]

                line = line.replace("_", " ")
                parent = name_to_id[line]
    p("Finished reading all info [Time: %s sec]" %(time.time()-starttime), "info")
    maxparent = maxchildren = outlinks = inlinks = 0
    mparent = mchildren = ""
    C_in = len(subgraph)
    C_out = len(subgraph)
    for category in graph: 
        if len(graph[category])> maxchildren: 
            maxchildren = len(graph[category])
            mchildren = category
        outlinks+= len(graph[category])
        grades[category] = [len(graph[category])]

        if category in subgraph:
            inlinks += len(subgraph[category])
            if len(subgraph[category]) > maxparent: 
                maxparent = len(subgraph[category])
                mparent = category
                if (len(graph[category]) > 10) and (len(subgraph[category]) > 10): 
                cnt10+= 1
            if (len(graph[category]) > 20) and (len(subgraph[category]) > 20): 
                cnt20+= 1
            if (len(graph[category]) > 30) and (len(subgraph[category]) > 30): 
                cnt30+= 1
            if (len(graph[category]) > 40) and (len(subgraph[category]) > 40): 
                cnt40+= 1
            if (len(graph[category]) > 50) and (len(subgraph[category]) > 50): 
                cnt50+= 1
            if (len(graph[category]) > 60) and (len(subgraph[category]) > 60): 
                cnt60+= 1
            if (len(graph[category]) > 70) and (len(subgraph[category]) > 70): 
                cnt70+= 1
            if (len(graph[category]) > 80) and (len(subgraph[category]) > 80): 
                cnt80+= 1
            if (len(graph[category]) > 90) and (len(subgraph[category]) > 90): 
                cnt90+= 1
            if (len(graph[category]) > 100) and (len(subgraph[category]) > 100): 
                cnt100+= 1
                print "category: %s, number: %d\n" %(category, len(graph[category]))
                #print "%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category]))
            #outputfile.write("%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category])))
            subgraph.pop(category, None)
            #outputfile.write("Only children: %s: %d\n" %(category, len(graph[category])))
    for category in subgraph: 
        grades[category] = [0, len(subgraph[category])]
        inlinks += len(subgraph[category])

    avg_in = inlinks / C_in
    avb_out = outlinks / C_out
                #print "%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category]))
            #outputfile.write("%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category])))
            subgraph.pop(category, None)
            #outputfile.write("Only children: %s: %d\n" %(category, len(graph[category])))
    for category in subgraph: 
        grades[category] = [0, len(subgraph[category])]
        inlinks += len(subgraph[category])

    avg_in = inlinks / C_in
    avb_out = outlinks / C_out
        #outputfile.write("Only parents: %s: %d\n" %(category, len(subgraph[category])))
    #return create_grades()
thresholds = [10, 20, 30, 40, 50]
p("Number of categories with %d parent categories and subcategories: %d" %(10, cnt10), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(20, cnt20), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(30, cnt30), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(40, cnt40), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(50, cnt50), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(60, cnt60), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(70, cnt70), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(80, cnt80), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(90, cnt90), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(100, cnt100), "info")

p("Maxparent: %d (%s)" %(maxparent, mparent), "info")
p("Maxchildren: %d (%s)" %(maxchildren, mchildren), "info")
#subcats = graph["2015"]
Esempio n. 6
import gzip, json, yaml, io
from myprint import myprint as p
Program for a file containing all dictionary entries and what page ids they are based on.
Sorts out all relevant dictionary entries based on the latest version of the igg-iabtaxonomy

Why? Because the file containing all possible entries and their page ids is extremely large.

Input: A file containing all processed Wikipedia article pages and their corresponding ids
Output: A file containing the relevant dictionary entries and what page ids they are based on

# Reading the English
enmappingfilename = "pageid-pagetitle-en.txt.gz"
enpagetitle_to_id = dict()
p("Reading %s" % (enmappingfilename), "info")
with, "rb") as enmappingfile:
    for line in enmappingfile:
        line = line.strip()
        splittet = line.split("\t")
        if len(splittet) < 2:
        pageid = splittet[0]
        pagetitle = splittet[1]
        if pagetitle in enpagetitle_to_id:
            enpagetitle_to_id[pagetitle] = [pageid]

latestversion = 5  # Latest version, update for later versions
    "\t [category-info.txt]\n"\
    "\t [article-info.txt.gz]\n"\
    "\t [article-output.txt.gz] \n\n"\
    "Create the complete paths of the articles. \n"

startcategory = "Main topic classifications" #"Fundamental Categories"
startcategory = startcategory.lower()
parent = ""
graph = dict()    #Dictionary to keep track of the children to each parent cat
subgraph = dict() #Dictionary to keep track of the parents to each subcategory

starttime = time.time()
begintime = starttime
p("Reading all category info", "info")
parent = child = ""
with open(categoryinfofilename) as categorygraph:
    for line in categorygraph:
        line = line.strip()
        if line.startswith("*"): #children
            child_name = line[2:]
            child = idmapper.name_to_id(child_name)
            if parent == "":
            if parent in graph:
             #   if child not in graph[parent]:
                graph[parent] = [child]
Esempio n. 8
    "python \n"\
    "\t [enwiki-latest-page_props.sql.gz]\n"\
    "\t [enwiki-latest-page.sql.gz\n"\
    "\t [All_hidden_categories.txt.gz]\n\n"\
    "[FUNC:] Find all hidden categories in the categorylinks file and then in the page_props file and combine all of these to one big file. \n"

hiddencat_id = dict(
)  #Dictionary to keep track of the ids of all hidden categories
hidden_cat = dict(
)  #Dictionary to keep track of the name of all hidden categories
hiddencnt = 0
start_time = time.time()

# Looping through page props to find the ids of all hidden categories
p("Finding all hidden category ids from page props...", "info")
with as inputfile:
    for line in inputfile:
        if (line.startswith("INSERT")):
            line = line.split("VALUES (")[1]
            line.decode('utf-8', 'ignore')
            insertions = line.split("),(")
            for insertion in insertions:
                insertion = insertion.lower()
                words = insertion.split(",")
                if "hiddencat" in words[1].lower():
                    # Hidden category is found, id is added
                    hiddencat_id[words[0]] = 1
                    hiddencnt += 1  # Counter to keep track of number of hidden categories
mytime = time.time() - start_time
Esempio n. 9
    titlefilename = sys.argv[1]
    outputtitlesfilename = sys.argv[2]
    print "\n[RUN]: \n"\
    "python \n"\
    "\t [enwiki-latest-redirects.sql.gz]\n"\
    "\t [output-redirect-titles.txt.gz]\n\n"\
    "Find all titles that redirecs \n"

redirects = dict()  #Dictionary for keeping all redirect pages
starttime = time.time()

# Reads the redirect file
p("Reading all redirect titles", "info")
with as titlefile:
    for line in titlefile:
        if line.startswith("INSERT"):
            line.decode('utf-8', 'ignore')
            line_split = line[30:] #.split("VALUES (")[1]
            insertions = line_split.split("),(")
            for insertion in insertions:

                # Code for encoding
                    insertion = insertion.decode('unicode-escape')
                except SyntaxError:
                    insertion = insertion.decode('ascii')
                except Exception,e:
                    a = 0
Program for finding the pageid for an entry.
Needs to do the same process as the mapper so that the entries are identical.

categoryinputfilename = "enwiki-latest-categorylinks.sql.gz"
redirectinputfilename = "output-redirect-titles.txt.gz"
pagefilename = "enwiki-latest-page.sql.gz"

redirects = dict()
pageid_to_title = dict()
pagetitle_to_id = dict()

p("Reading %s" %(inputfilename), "info")
with, "rb") as inputfile: 
    for line in inputfile: 
        line = line.strip()
        line = line.lower()
        splittet = line.split("\t")
        if len(splittet) < 2: 
            #print "< 2: " + line
        nopages[splittet[0]] = splittet[1]

p("Finished reading %s" %(inputfilename), "info")
p("Reading %s" %(redirectinputfilename), "info")
    "python \n"\
    "\t [Sub-categories-new.txt.gz]\n"\
    "\t [All_hidden_categories.txt.gz]\n"\
    "\t [Subcat-links.txt.gz\n\n"\
    "[FUNC:] Split the categorylink-file to the categories concerning pages and those concerning sub categories. Skip all hidden categories to remove number of relevant category links.\n"

allcategories = dict()  #Dictionary to keep track of all categories
hidden_cat = dict()     #Dictionary to keep track of all hidden categories
links = dict()          #Dictionary to keep track of all links in the graph
artskip = catskip = hiddencnt = 0

starttime = time.time()

# Reads all the hidden categories from the file
p("Reading all hidden categories", "info")
with, "rb") as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()

        # Code for representing the category names in same encoding
            line = line.decode('unicode-escape')
        except SyntaxError:
            line = line.decode('ascii')
        except Exception,e:
            a = 0

            line = unidecode(line)
        except UnicodeEncodeError, e:
Esempio n. 12
import gzip, re
from myprint import myprint as p
from unidecode import unidecode
Program for finding the pageid for an entry.
Needs to do the same process as the mapper so that the entries are identical.

categoryinputfilename = "enwiki-latest-categorylinks.sql.gz"
redirectinputfilename = "output-redirect-titles.txt.gz"
pagefilename = "enwiki-latest-page.sql.gz"

redirects = dict()
pageid_to_title = dict()
pagetitle_to_id = dict()
p("Reading %s" %(inputfilename), "info")
with, "rb") as inputfile: 
    for line in inputfile: 
        line = line.strip()
        line = line.lower()
        splittet = line.split("\t")
        if len(splittet) < 2: 
            #print "< 2: " + line
        nopages[splittet[0]] = splittet[1]

p("Finished reading %s" %(inputfilename), "info")
p("Reading %s" % (redirectinputfilename), "info")
    print "\n[RUN]: \n"\
    "python \n"\
    "\t [enwiki-latest-page_props.sql.gz]\n"\
    "\t [enwiki-latest-page.sql.gz\n"\
    "\t [All_hidden_categories.txt.gz]\n\n"\
    "[FUNC:] Find all hidden categories in the categorylinks file and then in the page_props file and combine all of these to one big file. \n"

hiddencat_id = dict()   #Dictionary to keep track of the ids of all hidden categories
hidden_cat = dict()     #Dictionary to keep track of the name of all hidden categories
hiddencnt = 0
start_time = time.time()

# Looping through page props to find the ids of all hidden categories
p("Finding all hidden category ids from page props...", "info")
with as inputfile:
    for line in inputfile:
        if (line.startswith("INSERT")):
            line = line.split("VALUES (")[1]
            line.decode('utf-8', 'ignore')
            insertions = line.split("),(")
            for insertion in insertions:
                insertion = insertion.lower()
                words = insertion.split(",")
                if "hiddencat" in words[1].lower():
                    # Hidden category is found, id is added
                    hiddencat_id[words[0]] = 1
                    hiddencnt += 1  # Counter to keep track of number of hidden categories
mytime = time.time() - start_time
p("Found %d hidden category ids (%s min, %s min) ---" %(hiddencnt,  mytime/60, mytime%60), "info")
Esempio n. 14
class Path(object):
    """A path paired with its score, ordered highest-score-first.

    The comparison is deliberately inverted: the instance with the higher
    ``score`` compares as the smaller one, so an ascending sort returns the
    highest-scoring path first.

    Only ordering is defined here; equality and hashing are left as they
    were.

    Args:
        score: Value the ordering is based on.
        path: Stored unchanged on the instance.
    """

    def __init__(self, score, path):
        self.score = score
        self.path = path

    def __cmp__(self, other):
        # Python 2 comparison hook, kept for existing callers. Spelled out
        # without the ``cmp`` builtin, which does not exist on Python 3.
        return (other.score > self.score) - (other.score < self.score)

    # Python 3 never calls __cmp__, so the ordering operators are defined
    # explicitly with the same inverted rule.
    def __lt__(self, other):
        return self.score > other.score

    def __le__(self, other):
        return self.score >= other.score

    def __gt__(self, other):
        return self.score < other.score

    def __ge__(self, other):
        return self.score <= other.score
letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", \
               "t", "u", "v", "w", "x", "y", "z", "restfile"]

gradesfilename = "category-grade.txt.gz"
idfilename = "id-mapper.txt.gz"

p("Reading all grades", "info")
allgrades = dict()
grades = dict()
with, "rb") as gradefile: 
    for line in gradefile: 
        line = line.strip()
        splittet = line.split("\t")
        category = splittet[0]
        grade = float(splittet[1])
        grades[category] = grade

id_to_name = dict()
p("Reading all ids for all categories", "info")
with, "rb") as idfile: 
    for line in idfile: 
        line = line.strip()
Esempio n. 15
def find_grades(categoryinfofilename, categoriesoutputfilename):
    global grades
    global C_in
    global C_out
    global avg_in
    global avg_out
    starttime = time.time()
    begintime = starttime
    p("Reading all category info", "info")
    parent = child = ""
    graph = dict(
    )  # Dictionary to keep track of the children to each parent cat
    subgraph = dict()  # Dictionary to keep track of the parents of each cat

    # Creating category graph

    with open(categoryinfofilename) as categorygraph:
        for line in categorygraph:
            line = line.strip()
            if line.startswith("*"):  #children
                child = name_to_id[line[2:]]
                if parent == "":
                if parent in graph:
                    if child not in graph[parent]:
                    graph[parent] = [child]

                if child in subgraph:
                    if parent in subgraph[child]:
                        a = 0
                    subgraph[child] = [parent]

                line = line.replace("_", " ")
                parent = name_to_id[line]

    p("Finished reading all info [Time: %s sec]" % (time.time() - starttime),
    maxparent = maxchildren = outlinks = inlinks = 0
    mparent = mchildren = ""

    C_in = len(subgraph)
    C_out = len(subgraph)
    for category in graph:
        if len(graph[category]) > maxchildren:
            maxchildren = len(graph[category])
            mchildren = category
        outlinks += len(graph[category])

        grades[category] = [len(graph[category])]

        if category in subgraph:


            inlinks += len(subgraph[category])
            if len(subgraph[category]) > maxparent:
                maxparent = len(subgraph[category])
                mparent = category
                if (len(graph[category]) > 10) and (len(subgraph[category]) > 10): 
                cnt10+= 1
            if (len(graph[category]) > 20) and (len(subgraph[category]) > 20): 
                cnt20+= 1
            if (len(graph[category]) > 30) and (len(subgraph[category]) > 30): 
                cnt30+= 1
            if (len(graph[category]) > 40) and (len(subgraph[category]) > 40): 
                cnt40+= 1
            if (len(graph[category]) > 50) and (len(subgraph[category]) > 50): 
                cnt50+= 1
            if (len(graph[category]) > 60) and (len(subgraph[category]) > 60): 
                cnt60+= 1
            if (len(graph[category]) > 70) and (len(subgraph[category]) > 70): 
                cnt70+= 1
            if (len(graph[category]) > 80) and (len(subgraph[category]) > 80): 
                cnt80+= 1
            if (len(graph[category]) > 90) and (len(subgraph[category]) > 90): 
                cnt90+= 1
            if (len(graph[category]) > 100) and (len(subgraph[category]) > 100): 
                cnt100+= 1
                print "category: %s, number: %d\n" %(category, len(graph[category]))
            #print "%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category]))
            #outputfile.write("%s: %d, %d\n" %(category, len(graph[category]), len(subgraph[category])))
            subgraph.pop(category, None)
            #outputfile.write("Only children: %s: %d\n" %(category, len(graph[category])))
    for category in subgraph:
        grades[category] = [0, len(subgraph[category])]
        inlinks += len(subgraph[category])

    avg_in = inlinks / C_in
    avb_out = outlinks / C_out
Esempio n. 16
    # Method for printing the entry, only for debugging
    def entryprint(self):
        """Print this object's oldentry, newentry and tierone values to stdout.

        Debugging aid only; nothing is returned.
        """
        print "Old: %s, new: %s, tierones: %s\n" %(self.oldentry, self.newentry, self.tierone) 

    # Method for comparing this object to another
    def myequal(self, obj):
        """Return True when every category in obj.tierone is also in self.tierone.

        The check is one-directional: categories found only in
        self.tierone do not affect the result, and an empty obj.tierone
        always yields True.
        """
        return all(wanted in self.tierone for wanted in obj.tierone)

version = 4                                     # Version of the dictionary to be cleaned

# Reading the dictionary
inputfilename = "igg-dictionary-" + str(version) + ".json"
p("Reading json file", "info")
with open(inputfilename, "rb") as inputfile:
    iggdictionary = yaml.load(inputfile)        # Loading the dictionary from file
p("finished reading python json", "info")

iggiabtaxonomy = "igg-iabtaxonomy" + str(version)
iggiabtaxonomynew = "igg-iabtaxonomy" + str(version+1)
dictionary = iggdictionary[iggiabtaxonomy]

newdictionary = dict()                # Dictionary for storing the final results
paraentries = dict()                  # Dictionary for keeping track on the changes made to the entries
disambiguation = dict()               # Dictionary for all disambiguation titles
disambiguationentries = dict()        # Dictionary for keeping track on all disambiguation entries

yearregex = "(\d\d\d\d)"              # Regex for recognizing years in the title
parenthesisregex = "(\(.*\))"         # Regex for recognizing parenthesis in the title
Esempio n. 17
import gzip, re
from myprint import myprint as p
from unidecode import unidecode

inputfilename = "no-mapping.txt.gz"
categoryinputfilename = "enwiki-latest-categorylinks.sql.gz"
redirectinputfilename = "output-redirect-titles.txt.gz"
pagefilename = "enwiki-latest-page.sql.gz"

nopages = dict()
redirects = dict()
pages = dict()

p("Reading %s" %(inputfilename), "info")
with, "rb") as inputfile: 
    for line in inputfile: 
        line = line.strip()
        line = line.lower()
        splittet = line.split("\t")
        if len(splittet) < 2: 

        nopages[splittet[0]] = splittet[1]

p("Finished reading %s" %(inputfilename), "info")

p("Reading %s" %(redirectinputfilename), "info")
with, "rb") as redirectfile:
    for line in redirectfile:     
        #line = unicode(line, "utf-8")
from myprint import myprint as p

Program for a file containing all dictionary entries and what page ids they are based on.
Sorts out all relevant dictionary entries based on the latest version of the igg-iabtaxonomy

Why? Because the file containing all possible entries and their page ids is extremely large.

Input: A file containing all processed Wikipedia article pages and their corresponding ids
Output: A file containing the relevant dictionary entries and what page ids they are based on

# Reading the English
enmappingfilename = "pageid-pagetitle-en.txt.gz"
enpagetitle_to_id = dict()
p("Reading %s" %(enmappingfilename), "info")
with, "rb") as enmappingfile: 
    for line in enmappingfile: 
        line = line.strip()
        splittet = line.split("\t")
        if len(splittet) < 2: 
        pageid = splittet[0]
        pagetitle = splittet[1]
        if pagetitle in enpagetitle_to_id:
            enpagetitle_to_id[pagetitle] = [pageid]

Esempio n. 19
from myprint import myprint as p

Program for mapping all keywords to IAB categories. 

inputfilename = "articlemapping-all.txt.gz"
outputcategoriesfilename = "Outputcategories"

letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "restfile"]

outputcategories = dict()

tiertwo = ""
tierone = ""
p("Reading output categories", "info")
with open(outputcategoriesfilename, "r") as outputcatfile: 
    for line in outputcatfile: 
        line = line.strip()
        if line.startswith("*"): 
            tierone = line[1:]
            tierone = line.lower()
            tiertwo = line
            tiertwo = line.lower()
            outputcategories[tiertwo] = tierone

idfilename = "id-mapper.txt.gz"
id_to_name = dict()
name_to_id = dict()
Esempio n. 20
Program for mapping all keywords to IAB categories. 

inputfilename = "articlemapping-all.txt.gz"
outputcategoriesfilename = "Outputcategories"

letters = [
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o",
    "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "restfile"

outputcategories = dict()

tiertwo = ""
tierone = ""
p("Reading output categories", "info")
with open(outputcategoriesfilename, "r") as outputcatfile:
    for line in outputcatfile:
        line = line.strip()
        if line.startswith("*"):
            tierone = line[1:]
            tierone = line.lower()
            tiertwo = line
            tiertwo = line.lower()
            outputcategories[tiertwo] = tierone

idfilename = "id-mapper.txt.gz"
id_to_name = dict()
name_to_id = dict()
p("Reading id-mapper", "info")
Esempio n. 21
class Path(object):
    """A path paired with its score, ordered highest-score-first.

    The comparison is deliberately inverted: the instance with the higher
    ``score`` compares as the smaller one, so an ascending sort returns the
    highest-scoring path first.

    Only ordering is defined here; equality and hashing are left as they
    were.

    Args:
        score: Value the ordering is based on.
        path: Stored unchanged on the instance.
    """

    def __init__(self, score, path):
        self.score = score
        self.path = path

    def __cmp__(self, other):
        # Python 2 comparison hook, kept for existing callers. Spelled out
        # without the ``cmp`` builtin, which does not exist on Python 3.
        return (other.score > self.score) - (other.score < self.score)

    # Python 3 never calls __cmp__, so the ordering operators are defined
    # explicitly with the same inverted rule.
    def __lt__(self, other):
        return self.score > other.score

    def __le__(self, other):
        return self.score >= other.score

    def __gt__(self, other):
        return self.score < other.score

    def __ge__(self, other):
        return self.score <= other.score

letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", \
               "t", "u", "v", "w", "x", "y", "z", "restfile"]

gradesfilename = "category-grade.txt.gz"
idfilename = "id-mapper.txt.gz"

p("Reading all grades", "info")
allgrades = dict()
grades = dict()
with, "rb") as gradefile:
    for line in gradefile:
        line = line.strip()
        splittet = line.split("\t")
        category = splittet[0]
        grade = float(splittet[1])
        grades[category] = grade

id_to_name = dict()
p("Reading all ids for all categories", "info")
with, "rb") as idfile:
    for line in idfile:
        line = line.strip()
Esempio n. 22
def is_number(input):
        """Return True for any ``input`` (as currently written).

        NOTE(review): the second ``return`` below is unreachable, so this
        function never returns False. The deep indentation suggests the body
        once sat inside a try/except around a numeric conversion that is no
        longer present -- confirm against the original script before relying
        on the result.
        """
        return True
        return False

start_time = time.time()
categorycnt = pagecnt = lines = 0

hidden_cat = dict()
artskip = catskip = hiddencnt = 0

p("Reading all hidden categories", "info")
with as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()
        if line in hidden_cat:
            hidden_cat[line] += 1
            hidden_cat[line] = 1
p("All hidden categories read", "info")

redirects = dict()
p("Reading all redirects", "info")
with, "rb") as redirectfile:
    for line in redirectfile:
        line.decode('utf-8', 'ignore')
        line = line.lower()
    "python \n"\
    "\t [Sub-categories-new.txt.gz]\n"\
    "\t [All_hidden_categories.txt.gz]\n"\
    "\t [Subcat-links.txt.gz\n\n"\
    "[FUNC:] Split the categorylink-file to the categories concerning pages and those concerning sub categories. Skip all hidden categories to remove number of relevant category links.\n"

allcategories = dict()  #Dictionary to keep track of all categories
hidden_cat = dict()  #Dictionary to keep track of all hidden categories
links = dict()  #Dictionary to keep track of all links in the graph
artskip = catskip = hiddencnt = 0

starttime = time.time()

# Reads all the hidden categories from the file
p("Reading all hidden categories", "info")
with, "rb") as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()

        # Code for representing the category names in same encoding
            line = line.decode('unicode-escape')
        except SyntaxError:
            line = line.decode('ascii')
        except Exception, e:
            a = 0

            line = unidecode(line)
        except UnicodeEncodeError, e:
Esempio n. 24
    "economics", "education", "environment", "form", "geography", "government",
    "health", "history", "humanities", "humans", "industry", "information",
    "intellectual works", "knowledge", "language", "law", "leisure", "life",
    "mathematics", "matter", "medicine", "mind", "nature", "people",
    "politics", "professional studies", "science", "scientific disciplines",
    "society", "sports", "structure", "systems", "technology", "thought",
    "tools", "transport", "universe", "world"
startcategory = startcategory.lower()
parent = ""
graph = dict()  #Dictionary to keep track of the children to each parent cat
subgraph = dict()  #Dictionary to keep track of the parents to each subcategory

starttime = time.time()
begintime = starttime
p("Reading all category info", "info")
parent = child = ""
with open(categoryinfofilename) as categorygraph:
    for line in categorygraph:
        line = line.strip()
        if line.startswith("*"):  #children
            child = line[2:]
            if parent == "":
            if parent in graph:
                #   if child not in graph[parent]:
                graph[parent] = [child]
        "\t [Page-categories.txt.gz]\n"\
        "\t [article-info.txt.gz]\n\n"\
    "[FUNCTION]: \n"\
    "Store all articles with their immidiate subcategories\n"


articles = dict(
)  #Dictionary to keep track of all categories and their articles
artcnt = teller = articlecnt = 0
starttime = time.time()

# Reads the file containing links between categories and articles
p("Reading all article content...", "info")
with as articleinfo:
    for line in articleinfo:
        line = line.strip()
        lines = line.split("\t")
        if len(lines) < 2:
        category = lines[0].lower()
        page = lines[1].lower()
        if "" == page or " " == page:

        if page in articles:
            # page is already in the dictionary
            if category not in articles[page]:
                # Add the category if not present
Esempio n. 26
    categoryinputfilename = sys.argv[1]
    hiddencategoryinputfilename = sys.argv[2]
    print "\n[RUN]: \n"\
    "python \n"\
    "\t [Subcat-links.txt.gz]\n"\
    "\t [All_hidden_categories.txt.gz]\n"

hidden_cat = dict()  #Dictionary to check all hidden category names
hiddencnt = 0  #Counter to see how many hidden categories are read
hiddencat_found = False  #Boolean to determine if hidden cateogry is found within graph

# Reads all hidden categories from the file containing the names of all hidden categories
p("Reading all hidden categories", "info")
with as hiddencategories:
    for line in hiddencategories:
        line = line.strip().lower()
        if line in hidden_cat:
            hidden_cat[line] += 1
            hiddencnt += 1
            hidden_cat[line] = 1
p("All hidden categories read", "info")

# Reads the category graph
p("Read all category links", "info")
with, "rb") as inputfile:
    for line in inputfile:
        line = line.strip()
# -*- coding: utf-8 -*-
import gzip, json, yaml, io
from myprint import myprint as p

Program for creating a dictionary for a dictionary-based classifier
for another language (based on the English dictionary)

nomappingfilename = "no-mapping.txt.gz"

nomapping = dict()
p("Reading %s" %(nomappingfilename), "info")
with, "rb") as nomappingfile: 
    for line in nomappingfile: 
        line = line.strip()
        splittet = line.split("\t")
        if len(splittet) < 2: 
        nomapping[splittet[0]] = splittet[1]

entrytoidfilename = "en-entry-to-pageid.json"
p("Reading %s" %(entrytoidfilename), "info")
with open(entrytoidfilename, "rb") as inputfile:
    entrypageid = yaml.load(inputfile)
p("finished reading python json", "info")

norwegianstopwords = []
stopwordfile = "norwegian_stop_words.txt"
with open(stopwordfile, "r") as inputfile: 
        print "Old: %s, new: %s, tierones: %s\n" % (
            self.oldentry, self.newentry, self.tierone)

    # Method for comparing this object to another
    def myequal(self, obj):
        """Return True when every category in obj.tierone is also in self.tierone.

        The check is one-directional: categories found only in
        self.tierone do not affect the result, and an empty obj.tierone
        always yields True.
        """
        return all(wanted in self.tierone for wanted in obj.tierone)

version = 4  # Version of the dictionary to be cleaned

# Reading the dictionary
inputfilename = "igg-dictionary-" + str(version) + ".json"
p("Reading json file", "info")
with open(inputfilename, "rb") as inputfile:
    iggdictionary = yaml.load(inputfile)  # Loading the dictionary from file
p("finished reading python json", "info")

iggiabtaxonomy = "igg-iabtaxonomy" + str(version)
iggiabtaxonomynew = "igg-iabtaxonomy" + str(version + 1)
dictionary = iggdictionary[iggiabtaxonomy]

newdictionary = dict()  # Dictionary for storing the final results
paraentries = dict(
)  # Dictionary for keeping track on the changes made to the entries
disambiguation = dict()  # Dictionary for all disambiguation titles
disambiguationentries = dict(
)  # Dictionary for keeping track on all disambiguation entries
# -*- coding: utf-8 -*-
import gzip, json, yaml, io
from myprint import myprint as p
Program for creating a dictionary for a dictionary-based classifier
for another language (based on the English dictionary)

nomappingfilename = "no-mapping.txt.gz"

nomapping = dict()
p("Reading %s" % (nomappingfilename), "info")
with, "rb") as nomappingfile:
    for line in nomappingfile:
        line = line.strip()
        splittet = line.split("\t")
        if len(splittet) < 2:
        nomapping[splittet[0]] = splittet[1]

entrytoidfilename = "en-entry-to-pageid.json"
p("Reading %s" % (entrytoidfilename), "info")
with open(entrytoidfilename, "rb") as inputfile:
    entrypageid = yaml.load(inputfile)
p("finished reading python json", "info")

norwegianstopwords = []
stopwordfile = "norwegian_stop_words.txt"
with open(stopwordfile, "r") as inputfile:
    for line in inputfile:
Esempio n. 30
import gzip, re
from myprint import myprint as p
from unidecode import unidecode

inputfilename = "no-mapping.txt.gz"
categoryinputfilename = "enwiki-latest-categorylinks.sql.gz"
redirectinputfilename = "output-redirect-titles.txt.gz"
pagefilename = "enwiki-latest-page.sql.gz"

nopages = dict()
redirects = dict()
pages = dict()

p("Reading %s" % (inputfilename), "info")
with, "rb") as inputfile:
    for line in inputfile:
        line = line.strip()
        line = line.lower()
        splittet = line.split("\t")
        if len(splittet) < 2:

        nopages[splittet[0]] = splittet[1]

p("Finished reading %s" % (inputfilename), "info")

p("Reading %s" % (redirectinputfilename), "info")
with, "rb") as redirectfile:
    for line in redirectfile:
        #line = unicode(line, "utf-8")
Esempio n. 31
            subgraph.pop(category, None)
            #outputfile.write("Only children: %s: %d\n" %(category, len(graph[category])))
    for category in subgraph:
        grades[category] = [0, len(subgraph[category])]
        inlinks += len(subgraph[category])

    avg_in = inlinks / C_in
    avb_out = outlinks / C_out
    #outputfile.write("Only parents: %s: %d\n" %(category, len(subgraph[category])))
    #return create_grades()

thresholds = [10, 20, 30, 40, 50]
p("Number of categories with %d parent categories and subcategories: %d" %(10, cnt10), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(20, cnt20), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(30, cnt30), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(40, cnt40), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(50, cnt50), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(60, cnt60), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(70, cnt70), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(80, cnt80), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(90, cnt90), "info")
p("Number of categories with %d parent categories and subcategories: %d" %(100, cnt100), "info")

p("Maxparent: %d (%s)" %(maxparent, mparent), "info")
p("Maxchildren: %d (%s)" %(maxchildren, mchildren), "info")
#subcats = graph["2015"]