def generateEntries(list, printErrors=True): """Generates array of tuples (anglicism, wiktionary-link).""" tuples = [] # Array for tuples (anglicism, wiktionary-link) errors = [] for e in list: percentage = list.index(e)*1.0/len(list)*100 wil("Creating tuples of anglicisms and their wikilink -" "%.2f%% complete" %(percentage), 60) try: anglicism = re.findall(">[0-9a-zA-Z-. äöüÄÖÜßé]+<", e) if anglicism == []: continue # Extracting the anglicisms anglicism = anglicism[0].replace("<", "").replace(">", "") wikilink = "" if "(Seite nicht vorhanden)" not in str(e): # Extracting the wikilink wikilink = re.findall('=".+"\s', e)[0].replace('="', "").replace('" ', "") wikilink = "http://de.wiktionary.org" + wikilink tuples.append((anglicism, wikilink)) except Exception, ex: errors.append((str(e), str(ex))) continue finally:
def extractDictEntries(lines, printErrors=True): """Extracts dictionary entries and returns an array of tuples.""" # You can find the entries in the dict.cc-file in following form: # Entry {specification} Additional Entry <Abbreviation> # [Comment1] [Comment2] [...] wordtype # Many of these parts are optional or depend on the word class. tuples = [] # Array of tuples in form of (German DictEntry object, # english DictEntry object, word class) errors = [] for i in xrange(len(lines)): percentage = i*1.0/len(lines)*100 wil("Extracting and generating dictionary entries - " "%.2f%% complete" %(percentage), 50) try: # Seperates german part, english part and word class _entries = re.split("\t", lines[i]) german_parts = extractParts(_entries[0]) english_parts = extractParts(_entries[1]) germanEntry = (german_parts[0], german_parts[1], german_parts[2], german_parts[3], german_parts[4]) englishEntry = (english_parts[0], english_parts[1], english_parts[2], english_parts[3], english_parts[4]) tuples.append((germanEntry, englishEntry, _entries[2]. replace("\n", ""))) except Exception, e: errors.append("%s with line %s" %(e, lines[i])) continue finally:
def readVectorFile(word_list, vectors_file, filter=True):
    """Reads a vector file into a dictionary mapping word -> list of floats.

    Each line of ``vectors_file`` (opened as UTF-8) is split on whitespace;
    the first token is used as the key and the remaining tokens are
    converted to floats.  The first two lines of the file are skipped.

    Arguments:
        word_list: List of words to keep when ``filter`` is true.  NOTE:
            every word that is found is removed from this list in place,
            so after the call it only holds the words without a vector.
        vectors_file: Path of the vector file.
        filter: If true, only vectors whose word occurs in ``word_list``
            are kept; otherwise every vector is kept.

    Returns:
        Dictionary {word: [float, ...]}.
    """
    w("Reading VectorFile %s..." %(vectors_file))
    D = {}
    fl()
    with codecs.open(vectors_file, "r", "utf-8") as fin:
        for i, line in enumerate(fin):
            # The format string used to carry a stray "%s" without a
            # matching argument, which raised a TypeError on the first line.
            wil("Reading VectorFile %s - %i lines so far"
                %(vectors_file, i+1), 20)
            vector = line.split()
            # The first two lines of a word2vec vector file can be ignored.
            # They only hold some information about the number of vectors
            # and the vector </s> (number of lines in the corpus).
            # Blank lines are skipped as well instead of raising IndexError.
            if i > 1 and vector:
                word = vector[0]
                if not filter:
                    D[word] = [float(x) for x in vector[1:]]
                elif word in word_list:
                    # Only vectors of words that are in the dictionary
                    # are kept; the word is then dropped from word_list.
                    D[word] = [float(x) for x in vector[1:]]
                    del word_list[word_list.index(word)]
            fl()
    wil("Reading VectorFile %s...Complete!\n" %(vectors_file), 30)
    return D
def readTupleFile(input_file, separation_character="\t"):
    """Reads a tuple file. Tuples are separated by separation_character.

    The lines are obtained through readFile(); every line has its newline
    characters removed and is split at ``separation_character``.

    Arguments:
        input_file: Path of the file to read.
        separation_character: String that separates the tuple elements.

    Returns:
        List of tuples of strings, one tuple per line.
    """
    lines = readFile(input_file)
    total = len(lines)
    tuples = []
    # enumerate() supplies the position directly; the former
    # lines.index(line) lookup was quadratic and reported the wrong
    # progress whenever a line occurred more than once.
    for position, line in enumerate(lines):
        percentage = position*1.0/total*100.0
        wil("Reading tuple file %s - %.2f%% complete" %(input_file,
            percentage), 30)
        parts = line.replace("\n", "").split(separation_character)
        tuples.append(tuple(parts))
        fl()
    wil("Reading tuple file %s...Complete!" %(input_file), 30, "\n")
    return tuples
def writeTupleFile(tuples, output_file, separation_character="\t", printErrors=True): """Enhanced function for writing a tuple file.""" file = codecs.open(output_file, "w", "utf-8") for i in xrange(len(tuples)): percentage = i*1.0/len(tuples)*100 wil("Writing file %s - %.2f%% complete" %(output_file, percentage), 50) try: file.write(unicode(tuples[i][0]) + separation_character + unicode(tuples[i][1]) + "\n") except Exception, ex: if printErrors: w("%s: %s" %(str(ex), str(tuples[i]))) continue finally:
def main():
    """Runs the whole comparison pipeline.

    Loads the dictionary, the anglicisms, the pickled German and English
    vectors and the false friends from ``../res``, creates one
    transformation matrix per (model, alpha) combination, lets
    compareMatrices() pick a model and finally repeats
    falseFriendsCheck() ``n_tests`` times, reporting how often it
    returned a false value.
    """
    cl()
    wh("\t\tCrOssinG: CompaRing Of AngliciSmS IN German", 75)

    dictionary = readDictionary("../res/dictEntries.txt")
    # The result is not used below; the call is kept so that the file is
    # still read (and reported) exactly as before.
    anglicisms = readTupleFile("../res/anglicisms.txt")
    # NOTE: pickle.load() can execute arbitrary code; only load vector
    # files that come from a trusted source.  The files are opened in
    # binary mode and closed again right after loading.
    with open("../res/DE_VEC.bin", "rb") as de_file:
        devectors = pickle.load(de_file)
    with open("../res/EN_VEC.bin", "rb") as en_file:
        envectors = pickle.load(en_file)
    false_friends = readTupleFile("../res/false_friends.txt")

    alphas = [0.0001, 0.0002, 0.001, 0.002, 0.01, 0.02, 0.1, 0.2]
    models = ["ridge", "net", "Lasso"]
    model_paras = [(model, alpha) for model in models for alpha in alphas]

    w("Creating VectorTransformators...\n")
    vm = VectorManager.VectorTransformator()
    vm.Dictionary = dictionary
    vm.V = devectors
    vm.W = envectors
    for number, (model, alpha) in enumerate(model_paras, 1):
        vm.createTransformationMatrix(model, alpha)
        w("VectorTransformator Nr. %i with Model=%s and alpha=%g has been"
          " created\n" %(number, model, alpha))
    w("Creating VectorTransformators...Complete!\n\n")

    models = vm.Models
    top_model = compareMatrices(false_friends, vm, models, devectors,
                                envectors)

    w("\nChecking the quality of an evaluation with false-friend-pairs...\n")
    false_count = 0  # Number of runs in which the check returned false
    n_tests = 100
    for i in range(n_tests):
        wil("False friend test nr. %i" %(i+1))
        res = falseFriendsCheck(false_friends, vm, top_model, devectors,
                                envectors, dictionary, 50, False)
        if not res:
            false_count += 1
    # A space was missing between "average" and "similarity".
    w("\nIn %i out of %i times, a random subset had a lower or equal average "
      "similarity than a random false friend subset.\n"
      %(false_count, n_tests))
def readTupleFileToDict(input_file, dicttype, separation_character="\t"):
    """Reads a tuple file into a dictionary.

    Tuples are separated by separation_character.  The first element of
    every line becomes the key, the second one the value.  If a key occurs
    several times, the first value is kept (setdefault).

    Arguments:
        input_file: Path of the file, read through readFile().
        dicttype: Sample value that selects the conversion: an int stores
            the values as int, a string stores keys and values as unicode.
            For any other type nothing is stored.
        separation_character: String that separates key and value.

    Returns:
        A defaultdict (without default factory) holding the entries.
    """
    lines = readFile(input_file)
    dict_ = defaultdict()
    LENGTH = len(lines)
    for i, raw_line in enumerate(lines):
        percentage = (i*1.0/LENGTH*100.0)
        # The stray "%s" at the end of the format string had no argument
        # and raised a TypeError on the first line.
        wil("Reading tuple file %s and creating dictionary -%.2f%% complete"
            %(input_file, percentage), 30)
        parts = raw_line.replace("\n", "").split(separation_character)
        if isinstance(dicttype, int):
            dict_.setdefault(parts[0], int(parts[1]))
        elif isinstance(dicttype, basestring):
            dict_.setdefault(unicode(parts[0]), unicode(parts[1]))
        fl()
    wil("Reading tuple file %s and creating dictionary...Complete!\n"
        %(input_file), 30)
    return dict_
def randomSubset(array, n, output=True):
    """Draws n random elements (with replacement) from array.

    Arguments:
        array: Sequence to draw from.  A dict is converted to a list of
            (key, value) tuples first.
        n: Number of elements to draw.  Values <= 0 give an empty list.
        output: If true, progress is reported through w(), wil() and fl().

    Returns:
        List with n elements of array; elements may occur repeatedly.

    Raises:
        ValueError: If n > 0 and array is empty.
    """
    if output:
        w("Creating random subset...")

    if isinstance(array, dict):
        # Conversion to an array of (key, value) tuples
        array = list(array.items())

    if n > 0 and len(array) == 0:
        raise ValueError("Cannot draw a random subset from an empty array")

    res = []
    # "<" instead of "!=" so that a negative (or non-integer) n cannot
    # lead to an endless loop.
    while len(res) < n:
        if output:
            percentage = len(res)*1.0/n*100
            wil("Creating random subset - %.2f%% complete" %(percentage))
        ri = random.randint(0, len(array)-1)
        res.append(array[ri])
        if output:
            fl()

    if output:
        wil("Creating random subset...Complete!", 50, "\n")
    return res
def getAnglicismsList(url):
    """Extracts a list of anglicisms from a wiktionary page.

    The page behind ``url`` is fetched and parsed, every <p> section is
    split at the dash separators and the resulting pieces (without the
    separators themselves) are returned as a list of strings.
    """
    page = BS(urllib2.urlopen(url))  # Parsed html-code of the page
    paragraphs = page.find_all("p")  # Every relevant section of the page
    wil("Extracting anglicisms from wictionary.", 30)

    # The many variants of separators; the capturing group makes re.split
    # return the separators as well, they are filtered out further below.
    separator_pattern = "( - | – | -|- |– )"
    pieces = []
    for paragraph in paragraphs:
        pieces.extend(re.split(separator_pattern, str(paragraph)))
    pieces = pieces[3:-1]  # Using only the relevant parts
    fl()

    wil("Extracting anglicisms from wictionary..")
    redundant = (" - ", "- ", " -", " – ", "– ")
    # Dropping redundant matches
    entries = [piece for piece in pieces if piece not in redundant]
    fl()

    wil("Extracting anglicisms from wictionary...Complete!", 30, "\n")
    return entries
def readFile(filename, ignore_character="##########", onestring=False):
    """Reads a UTF-8 encoded file.

    Arguments:
        filename: Path of the file to read.
        ignore_character: Lines starting with this string are left out
            (for skipping redundant lines).
        onestring: If true, the kept lines are returned as one string in
            which every line is followed by a blank.

    Returns:
        List of the kept lines (line endings included), or a single
        string if ``onestring`` is true.
    """
    wil("Reading file %s" %(filename))
    lines = []
    # The with-statement makes sure the file is closed again; the handle
    # used to stay open.
    with codecs.open(filename, "r", "utf-8") as file_:
        fl()
        for count, line in enumerate(file_):
            wil("Reading File %s - %i lines so far" %(filename, count), 20)
            if not line.startswith(ignore_character):
                lines.append(line)
            fl()
    wil("Reading file %s...Complete!" %(filename), 30, "\n")

    if onestring:
        # If result should be one string instead of an array of strings
        return "".join(line + " " for line in lines)
    return lines
def lookUpTranslations(list, printErrors=True): """Looks up the English translation of an anglicism.""" # Array for tuples with format (anglzism, [translation 1, translation2]) tuples = [] for e in list: percentage = list.index(e)*1.0/len(list)*100 wil("Looking up translations for %s - %.2f%% complete" %(e[0].replace("ä", "ae").replace("é", "e"), percentage), 20) if e[1] == "": # If there is no wikilink fl() continue try: # Extracting the html-code of wiktionary-page r = urllib2.Request(e[1]) html = BS(urllib2.urlopen(r)) # If there are English translations if len(re.findall("/wiki/Englisch.+<\/li>", str(html))) > 0: translations = re.findall("/wiki/Englisch.+<\/li>", unicode(html))[0] translations = re.findall(">[0-9a-zA-Z-. äöüÄÖÜßé]+<", translations) for i in range(len(translations)-1, -1, -1): if translations[i] == "> <" or \ translations[i] == ">Englisch<": translations.pop(i) # Popping redundant matches... else: # ...or just formatting the results translations[i] = translations[i].replace(">", "").replace("<", "") else: translations = [] # Default tuples.append((e[0].decode('utf-8'), translations)) except Exception, ex: if printErrors: print str(ex) fl()
def extractFalseFriends(lines):
    """Extracts false friends from the lines of a .txt file.

    Four consecutive lines form one entry; newlines and tabs are removed
    from each of them.  A line consisting only of a newline discards the
    entry collected so far and starts a new one.

    Returns:
        List of 4-tuples of strings.
    """
    wil("Extracting False Friends...")
    tuples = []
    fields = [""] * 4
    position = 0
    for line in lines:
        if line == "\n":
            # Empty line: start over with a fresh entry
            fields = [""] * 4
            position = 0
            continue
        fields[position] = line.replace("\n", "").replace("\t", "")
        if position == 3:
            # Entry is complete
            tuples.append(tuple(fields))
            fields = [""] * 4
            position = 0
        else:
            position += 1
    fl()
    wil("Extracting False Friends...Complete!\n")
    return tuples
wikilink = "" if "(Seite nicht vorhanden)" not in str(e): # Extracting the wikilink wikilink = re.findall('=".+"\s', e)[0].replace('="', "").replace('" ', "") wikilink = "http://de.wiktionary.org" + wikilink tuples.append((anglicism, wikilink)) except Exception, ex: errors.append((str(e), str(ex))) continue finally: fl() if printErrors == True: wil("The following errors occured:", 150, "\n") for error in errors: print "Error at entry: %s - %s" %(error[0], error[1]) wil("Creating tuples of anglicisms and their wikilinks...Complete!", 30, "\n") return tuples def lookUpTranslations(list, printErrors=True): """Looks up the English translation of an anglicism.""" # Array for tuples with format (anglzism, [translation 1, translation2]) tuples = [] for e in list: percentage = list.index(e)*1.0/len(list)*100 wil("Looking up translations for %s - %.2f%% complete"
german_parts = extractParts(_entries[0]) english_parts = extractParts(_entries[1]) germanEntry = (german_parts[0], german_parts[1], german_parts[2], german_parts[3], german_parts[4]) englishEntry = (english_parts[0], english_parts[1], english_parts[2], english_parts[3], english_parts[4]) tuples.append((germanEntry, englishEntry, _entries[2]. replace("\n", ""))) except Exception, e: errors.append("%s with line %s" %(e, lines[i])) continue finally: fl() wil("Extracting dictionary entries...Complete!", 90, "\n") if printErrors: w("The following errors occurred:\n") for error in errors: w(error) return tuples def extractParts(dict_string): """Extracts the different parts of an entry.""" entry_array = [] # Main entry specification = "" # Specification, e.g. numerus or gender additional_entry_array = [] # Additional Entry abbr = "" # Abbreviation comments = [] # Array of comments entry_end = False # To determine whether the main entry ended already # Splitting with whitspaces; connecting parts in brackets
"""Enhanced function for writing a tuple file.""" file = codecs.open(output_file, "w", "utf-8") for i in xrange(len(tuples)): percentage = i*1.0/len(tuples)*100 wil("Writing file %s - %.2f%% complete" %(output_file, percentage), 50) try: file.write(unicode(tuples[i][0]) + separation_character + unicode(tuples[i][1]) + "\n") except Exception, ex: if printErrors: w("%s: %s" %(str(ex), str(tuples[i]))) continue finally: fl() wil("Writing file %s...Complete!" %(output_file), 50, "\n") file.close() def dumpObject(self, obj, name=None): """Takes an object as an argument and dumps its content on disk using specified "name" as its file name. If no file name is specified, that object's __repr__ will be used instead. """ try: with open(name, "wb") as output_file: pickle.dump(obj, output_file, -1) print "Successfully dumped " + obj + " into " + name + "." except IOError: with open(repr(obj), "wb") as output_file: pickle.dump(obj, output_file, -1) print "Successfully dumped " + obj + \