def main():
    """Dump a sorted 'path / caption / infohash' listing of every torrent
    in uTorrent's resume.dat to TorrentList.txt in the main directory.
    """
    ss = Preferences()
    torrentlist = bencode.decode_from_file(ss.get("utresumedat"))
    # These two top-level entries are not per-torrent dicts and would break
    # the dict comprehension below, which expects only dicts as root values.
    torrentlist.pop(b".fileguard", None)
    torrentlist.pop(b"rec", None)
    # Reverse lookup: hex(info-hash) -> [path, caption, original key]
    reverselookup = {
        base64.b16encode(value[b"info"]): [value[b"path"], value[b"caption"], origkey]
        for origkey, value in torrentlist.items()
    }
    # Decode the bytes fields once, up front, into [path, caption, infohash] rows.
    partiallist = [
        [value[0].decode('utf-8'), value[1].decode('utf-8'), thehash.decode('utf-8')]
        for thehash, value in reverselookup.items()
    ]
    partiallist.sort()
    # FIX: use a context manager so the output file is closed even if a
    # write fails partway through (original used a bare open()/close()).
    outpath = os.path.join(ss.get("maindir"), "TorrentList.txt")
    with open(outpath, 'w', encoding='utf-8') as writelistfile:
        # One entry per line: path / caption / infohash
        for path, caption, infohash in partiallist:
            writelistfile.write(path + " / " + caption + " / " + infohash + "\n")
def main():
    # NOTE: Python 2 variant of this script (print statement, dict.iteritems()).
    # Writes a sorted "path / caption / infohash" listing to TorrentList.txt.
    ss = Preferences()
    torrentlist = bencode.decode_from_file(ss.get("utresumedat"))
    partiallist = [ ]  # set up an empty container for desired data to get put into for later
    # These two root entries are not torrent dicts; the comprehension below
    # expects only dicts as the root values.
    fileguarduseless = torrentlist.pop(".fileguard", None)
    rec = torrentlist.pop("rec", None)  # Remove this.
    # Create a reverse lookup dict (hex info-hash -> [path, caption, original key]).
    reverselookup = { base64.b16encode(value["info"]): [value["path"], value["caption"], origkey] for origkey, value in torrentlist.iteritems() }
    for thehash, value in reverselookup.iteritems():
        partiallist.append([value[0], value[1], thehash])
    partiallist.sort()
    # Write-out a text file with one entry per line.
    writelistfile = open( os.path.join(ss.get("maindir"), "TorrentList.txt"), 'wb')
    for eachline in partiallist:
        # path / caption / infohash
        writelistfile.write(eachline[0] + " / " + eachline[1] + " / " + eachline[2] + "\n")
    writelistfile.close()
    print "Finished writing: TorrentList.txt"
def main(): ss = Preferences() # settings.1py directory_path = os.path.join( ss.get("maindir"), u"All-Torrs\\" ) # needs a unicode symbol so os. commands work at all on paths with funny chars files = [ os.path.join(directory_path, fn) for fn in next(os.walk(directory_path))[2] ] # gives absolute paths + names torrentnamelist = [] for eachfile in files: with open(eachfile, "rb") as stringfile: try: torrent = bencode.decode(stringfile.read()) for key, value in torrent.iteritems(): if key == "announce": announce = value domain = "{uri.netloc}".format(uri=urlparse(announce)) colon = domain.find(":", 0) if colon != -1: domain = domain[:colon] if domain: tracker = domain # only using 1 value here(lazy) elif key == "announce-list": tracker = "Multiple Trackers" except: tracker = "None" torrentfilename = eachfile[eachfile.rfind("\\") + 1 :] if not os.path.exists(directory_path + tracker): os.makedirs(directory_path + tracker) os.rename(eachfile, os.path.join(directory_path + tracker + "\\" + torrentfilename))
def main():
    # NOTE: Python 2 variant (print statement, dict.iteritems()); near-duplicate
    # of the other TorrentList writer in this file.
    ss = Preferences()
    torrentlist = bencode.decode_from_file(ss.get("utresumedat"))
    partiallist = []  # set up an empty container for desired data to get put into for later
    # These two root entries are not torrent dicts; the comprehension below
    # expects only dicts as the root values.
    fileguarduseless = torrentlist.pop(".fileguard",None)
    rec = torrentlist.pop("rec",None)  # Remove this.
    # Create a reverse lookup dict (hex info-hash -> [path, caption, original key]).
    reverselookup={base64.b16encode(value["info"]):[value["path"],value["caption"],origkey] for origkey,value in torrentlist.iteritems()}
    for thehash,value in reverselookup.iteritems():
        partiallist.append([value[0],value[1],thehash])
    partiallist.sort()
    # Write-out a text file with one entry per line.
    writelistfile = open(os.path.join(ss.get("maindir"),"TorrentList.txt"),'wb')
    for eachline in partiallist:
        # path / caption / infohash
        writelistfile.write(eachline[0] + " / " + eachline[1] + " / " + eachline[2] + "\n")
    writelistfile.close()
    print "Finished writing: TorrentList.txt"
def main(): ss = Preferences() #settings.1py directory_path = os.path.join( ss.get("maindir"), u"All-Torrs\\" ) #needs a unicode symbol so os. commands work at all on paths with funny chars files = [ os.path.join(directory_path, fn) for fn in next(os.walk(directory_path))[2] ] #gives absolute paths + names torrentnamelist = [] for eachfile in files: with open(eachfile, 'rb') as stringfile: try: torrent = bencode.decode(stringfile.read()) for key, value in torrent.iteritems(): if key == "announce": announce = value domain = '{uri.netloc}'.format(uri=urlparse(announce)) colon = domain.find(':', 0) if colon != -1: domain = domain[:colon] if domain: tracker = domain #only using 1 value here(lazy) elif key == "announce-list": tracker = "Multiple Trackers" except: tracker = "None" torrentfilename = eachfile[eachfile.rfind("\\") + 1:] if not os.path.exists(directory_path + tracker): os.makedirs(directory_path + tracker) os.rename( eachfile, os.path.join(directory_path + tracker + "\\" + torrentfilename))
def _calc_terminal_scores(self, w):
    """ Calculate the score for each possible terminal/token match """
    # First pass: for each token, find the possible terminals that
    # can correspond to that token
    finals = defaultdict(set)
    tokens = dict()
    self._find_options(w, finals, tokens)
    # Second pass: find a (partial) ordering by scoring the terminal
    # alternatives for each token
    scores = dict()
    # Loop through the indices of the tokens spanned by this tree
    for i in range(w.start, w.end):
        s = finals[i]
        # Initially, each alternative has a score of 0
        scores[i] = {terminal: 0 for terminal in s}
        if len(s) <= 1:
            # No ambiguity to resolve here
            continue
        # More than one terminal in the option set for the token at index i
        # Calculate the relative scores
        # Find out whether the first part of all the terminals are the same
        same_first = len(set(terminal.first for terminal in s)) == 1
        txt = tokens[i].lower
        # No need to check preferences if the first parts of all possible
        # terminals are equal.
        # Look up the preference ordering from Reynir.conf, if any
        prefs = None if same_first else Preferences.get(txt)
        found_pref = False
        sc = scores[i]
        if prefs:
            # Collect the extreme (most negative / most positive) adjustment
            # per terminal, then apply each once.
            adj_worse = defaultdict(int)
            adj_better = defaultdict(int)
            for worse, better, factor in prefs:
                for wt in s:
                    if wt.first in worse:
                        for bt in s:
                            if wt is not bt and bt.first in better:
                                if bt.name[0] in "\"'":
                                    # Literal terminal:
                                    # be even more aggressive in promoting it
                                    adj_w = -2 * factor
                                    adj_b = +6 * factor
                                else:
                                    adj_w = -2 * factor
                                    adj_b = +4 * factor
                                adj_worse[wt] = min(adj_worse[wt], adj_w)
                                adj_better[bt] = max(adj_better[bt], adj_b)
                                found_pref = True
            for wt, adj in adj_worse.items():
                sc[wt] += adj
            for bt, adj in adj_better.items():
                sc[bt] += adj
        # Apply heuristics to each terminal that potentially matches this token
        for t in s:
            tfirst = t.first
            if tfirst == "ao" or tfirst == "eo":
                # Subtract from the score of all ao and eo
                sc[t] -= 1
            elif tfirst == "no":
                if t.is_singular:
                    # Add to singular nouns relative to plural ones
                    sc[t] += 1
                elif t.is_abbrev:
                    # Punish abbreviations in favor of other more specific terminals
                    sc[t] -= 1
            elif tfirst == "fs":
                if t.has_variant("nf"):
                    # Reduce the weight of the 'artificial' nominative
                    # prepositions 'næstum', 'sem', 'um'
                    sc[t] -= 5  # Make other cases outweigh the Nl_nf bonus of +4 (-2 -3 = -5)
                elif txt == "við" and t.has_variant("þgf"):
                    sc[t] += 1  # Smaller bonus for við + þgf (is rarer than við + þf)
                elif txt == "sem" and t.has_variant("þf"):
                    sc[t] -= 6  # Even less attractive than sem_nf
                else:
                    # Else, give a bonus for each matched preposition
                    sc[t] += 2
            elif tfirst == "so":
                if t.variant(0) in "012":
                    # Consider verb arguments
                    # Normally, we give a bonus for verb arguments:
                    # the more matched, the better
                    numcases = int(t.variant(0))
                    adj = 2 * numcases
                    # !!! Logic should be added here to encourage zero
                    # arguments for verbs in 'miðmynd'
                    if numcases == 0:
                        # Zero arguments: we might not like this
                        if all((m.stofn not in VerbObjects.VERBS[0]) and (
                                "MM" not in m.beyging)
                                for m in tokens[i].t2 if m.ordfl == "so"):
                            # No meaning where the verb has zero arguments
                            adj = -5
                    # Apply score adjustments for verbs with particular object cases,
                    # as specified by $score(n) pragmas in Verbs.conf
                    # In the (rare) cases where there are conflicting scores,
                    # apply the most positive adjustment
                    adjmax = 0
                    for m in tokens[i].t2:
                        if m.ordfl == "so":
                            key = m.stofn + t.verb_cases
                            score = VerbObjects.SCORES.get(key)
                            if score is not None:
                                adjmax = score
                                break
                    sc[t] += adj + adjmax
                if t.is_sagnb:
                    # We like sagnb and lh, it means that more
                    # than one piece clicks into place
                    sc[t] += 6
                elif t.is_lh:
                    # sagnb is preferred to lh, but vb (veik beyging) is discouraged
                    if t.has_variant("vb"):
                        sc[t] -= 2
                    else:
                        sc[t] += 3
                elif t.is_mm:
                    # Encourage mm forms. The encouragement should be better than
                    # the score for matching a single case, so we pick so_0_mm
                    # rather than so_1_þgf, for instance.
                    sc[t] += 3
                elif t.is_vh:
                    # Encourage vh forms
                    sc[t] += 2
                if t.is_subj:
                    # Give a small bonus for subject matches
                    if t.has_variant("none"):
                        # ...but a punishment for subj_none
                        sc[t] -= 3
                    else:
                        sc[t] += 1
                if t.is_nh:
                    if (i > 0) and any(pt.first == 'nhm' for pt in finals[i - 1]):
                        # Give a bonus for adjacent nhm + so_nh terminals
                        sc[t] += 4  # Prop up the verb terminal with the nh variant
                        for pt in scores[i - 1].keys():
                            if pt.first == 'nhm':
                                # Prop up the nhm terminal
                                scores[i - 1][pt] += 2
                                break
                    if any(pt.first == "no" and pt.has_variant("ef") and pt.is_plural
                            for pt in s):
                        # If this is a so_nh and an alternative no_ef_ft exists,
                        # choose this one (for example, 'hafa', 'vera', 'gera',
                        # 'fara', 'mynda', 'berja', 'borða')
                        sc[t] += 4
            elif tfirst == "tala" or tfirst == "töl":
                # A complete 'töl' or 'no' is better (has more info) than a rough 'tala'
                if tfirst == "tala":
                    sc[t] -= 1
                # Discourage possessive ('ef') meanings for numbers
                for pt in s:
                    if (pt.first == "no" or pt.first == "töl") and pt.has_variant("ef"):
                        sc[pt] -= 1
            elif tfirst == "sérnafn":
                if not tokens[i].t2:
                    # If there are no BÍN meanings, we had no choice but to use
                    # sérnafn, so alleviate some of the penalty given by the grammar
                    sc[t] += 2
                else:
                    # BÍN meanings are available: discourage this
                    sc[t] -= 6
                    if i == w.start:
                        # First token in sentence, and we have BÍN meanings:
                        # further discourage this
                        sc[t] -= 4
            elif t.name[0] in "\"'":
                # Give a bonus for exact or semi-exact matches
                sc[t] += 1
    return scores
def go_with_score(self, forest):
    """ Returns the argument forest after pruning it down to a single tree """
    if forest is None:
        return (None, 0)
    w = forest
    # First pass: for each token, find the possible terminals that
    # can correspond to that token
    finals = defaultdict(set)
    tokens = dict()
    self._find_options(w, finals, tokens)
    # Second pass: find a (partial) ordering by scoring the terminal
    # alternatives for each token
    scores = dict()
    # Loop through the indices of the tokens spanned by this tree
    for i in range(w.start, w.end):
        s = finals[i]
        # Initially, each alternative has a score of 0
        scores[i] = { terminal: 0 for terminal in s }
        if len(s) > 1:
            # More than one terminal in the option set
            # Calculate the relative scores
            # Find out whether the first part of all the terminals are the same
            same_first = len(set(x.first for x in s)) == 1
            txt = tokens[i].lower
            # No need to check preferences if the first parts of all possible
            # terminals are equal.
            # Look up the preference ordering from Reynir.conf, if any
            prefs = None if same_first else Preferences.get(txt)
            found_pref = False
            sc = scores[i]
            if prefs:
                for worse, better, factor in prefs:
                    for wt in s:
                        if wt.first in worse:
                            for bt in s:
                                if wt is not bt and bt.first in better:
                                    if bt.name[0] in "\"'":
                                        # Literal terminal:
                                        # be even more aggressive in promoting it
                                        sc[wt] -= 2 * factor
                                        sc[bt] += 6 * factor
                                    else:
                                        sc[wt] -= 2 * factor
                                        sc[bt] += 4 * factor
                                    found_pref = True
            # Apply heuristics to each terminal that potentially matches this token
            for t in s:
                tfirst = t.first
                if tfirst == "ao" or tfirst == "eo":
                    # Subtract from the score of all ao and eo
                    sc[t] -= 1
                elif tfirst == "no":
                    if t.is_singular:
                        # Add to singular nouns relative to plural ones
                        sc[t] += 1
                    elif t.is_abbrev:
                        # Punish abbreviations in favor of other more specific terminals
                        sc[t] -= 1
                elif tfirst == "fs":
                    if t.has_variant("nf"):
                        # Reduce the weight of the 'artificial' nominative
                        # prepositions 'næstum', 'sem', 'um'
                        sc[t] -= 3  # Make other cases outweigh the Nl_nf bonus of +4 (-2 -3 = -5)
                    else:
                        # Else, give a bonus for each matched preposition
                        sc[t] += 2
                elif tfirst == "so":
                    if t.variant(0) in "012":
                        # Consider verb arguments
                        # Normally, we give a bonus for verb arguments:
                        # the more matched, the better
                        adj = 2 * int(t.variant(0))
                        # !!! Logic should be added here to encourage zero
                        # arguments for verbs in 'miðmynd'
                        if adj == 0:
                            # Zero arguments: we might not like this
                            for m in tokens[i].t2:
                                if m.ordfl == "so" and m.stofn not in VerbObjects.VERBS[0]:
                                    # We're using a verb with zero arguments but that
                                    # form is not explicitly listed in Verbs.conf:
                                    # discourage this
                                    adj = -1
                                    break
                        sc[t] += adj
                    if t.is_sagnb:
                        # We like sagnb and lh, it means that more
                        # than one piece clicks into place
                        sc[t] += 4
                    elif t.is_lh:
                        # sagnb is preferred to lh, but vb (veik beyging) is discouraged
                        if t.has_variant("vb"):
                            sc[t] -= 2
                        else:
                            sc[t] += 3
                    if t.is_subj:
                        # Give a small bonus for subject matches
                        if t.has_variant("none"):
                            # ...but a punishment for subj_none
                            sc[t] -= 3
                        else:
                            sc[t] += 1
                    if t.is_nh:
                        if (i > 0) and any(pt.first == 'nhm' for pt in finals[i - 1]):
                            # Give a bonus for adjacent nhm + so_nh terminals
                            sc[t] += 2  # Prop up the verb terminal with the nh variant
                            for pt in scores[i - 1].keys():
                                if pt.first == 'nhm':
                                    # Prop up the nhm terminal
                                    scores[i - 1][pt] += 2
                        if any(pt.first == "no" and pt.has_variant("ef") and pt.is_plural
                                for pt in s):
                            # If this is a so_nh and an alternative no_ef_ft exists,
                            # choose this one (for example, 'hafa', 'vera', 'gera',
                            # 'fara', 'mynda', 'berja', 'borða')
                            sc[t] += 2
                elif tfirst == "tala" or tfirst == "töl":
                    # A complete 'töl' or 'no' is better (has more info) than a rough 'tala'
                    if tfirst == "tala":
                        sc[t] -= 1
                    # Discourage possessive ('ef') meanings for numbers
                    for pt in s:
                        if (pt.first == "no" or pt.first == "töl") and pt.has_variant("ef"):
                            sc[pt] -= 1
                elif tfirst == "sérnafn":
                    if tokens[i].t2:
                        sc[t] -= 20  # Base penalty is -20
                        for m in tokens[i].t2:
                            sc[t] -= 1  # Subtract one for each BÍN meaning available
                            if m.stofn[0].isupper():
                                # Heavily discourage 'sérnafn' if an uppercase
                                # BÍN meaning is available
                                sc[t] -= 8
                elif t.name[0] in "\"'":
                    # Give a bonus for exact or semi-exact matches
                    sc[t] += 1
    # Third pass: navigate the tree bottom-up, eliminating lower-rated
    # options (subtrees) in favor of higher rated ones
    score = self._reduce(w, scores)
    return (w, score)
def main():
    """Batch-rename downloaded torrent files/folders.

    Reads "<new name> / <hex infohash>" lines produced earlier (outpath3),
    writes a reviewable before/after plan to outpath4, waits for the user to
    confirm (and possibly edit the plan), then renames the paths on disk and
    patches uTorrent's resume data, writing the result to NEWDAT.dat.
    """
    ss = Preferences()
    newfile = open(os.path.join(ss.get("maindir"), "NEWDAT.dat"), 'wb')
    # ("3propernames.txt")
    namesandhashfile = open(ss.getwpath("outpath3"), 'r', encoding='utf-8').readlines()
    # Intermediate changes to happen before actually renaming, so the user has
    # a chance to edit/change it. (4beforepath-afterpath.txt)
    beforeafterpath = ss.getwpath("outpath4")
    torrentlist = bencode.decode_from_file(ss.get("utresumedat"))
    # These two root entries are not torrent dicts and interfere with the
    # comprehension below; "rec" is re-added before writing out, ".fileguard"
    # purposely stays out.
    torrentlist.pop(b".fileguard", None)
    rec = torrentlist.pop(b"rec", None)
    # Reverse lookup: hex(info-hash) -> [original key, caption, path]
    reverselookup = {
        base64.b16encode(value[b"info"]): [key, value[b"caption"], value[b"path"]]
        for key, value in torrentlist.items()
    }
    listofbeforeafter = []
    for eachline in namesandhashfile:
        # Strip the \n and split on the " / " separator written earlier.
        nameandhash = eachline.strip().split(' / ')
        theNewname = nameandhash[0]
        thehash = nameandhash[1]
        # If the hash exists in the resume data, plan a rename.
        if bytes(thehash, 'utf-8') in reverselookup:
            key = reverselookup[bytes(thehash, 'utf-8')][0]
            theOldPath = torrentlist[key][b"path"].decode('utf-8')
            theNewPath = os.path.join(os.path.dirname(theOldPath), theNewname)
            if theOldPath != theNewPath:
                listofbeforeafter.append([theOldPath, theNewPath, thehash])
    # Sort, then write the plan: old path on one line, new path on the next.
    # The hash is duplicated on both lines for error checking in case the
    # user accidentally mangles a character while editing.
    listofbeforeafter.sort()
    # FIX: context manager instead of bare open()/close().
    with open(beforeafterpath, 'w', encoding='utf-8') as beforeafterfile:
        for eachline in listofbeforeafter:
            beforeafterfile.write(eachline[0] + " / " + eachline[2] + "\n")
            beforeafterfile.write(eachline[1] + " / " + eachline[2] + "\n")
    # Pause so the user can review/edit the plan before anything is renamed.
    input("Press Enter to begin Renaming files.......\\> ")
    # WRITE TORRENT RESUME.DAT
    beforeafterfile = open(beforeafterpath, 'r', encoding='utf-8').readlines()
    for i in range(0, len(beforeafterfile), 2):
        beforeandhash = beforeafterfile[i].strip().split(' / ')
        afterandhash = beforeafterfile[i + 1].strip().split(' / ')
        before = beforeandhash[0]
        beforehash = beforeandhash[1]
        after = afterandhash[0]
        afterhash = afterandhash[1]
        if beforehash == afterhash:
            thehash = beforehash
        else:
            print("Error. You have inadvertently modified one of the hash files, and there is a hash mismatch between before/after entries.")
            print("Cannot continue. Exiting. Please save your changes into a new file, locate your error, and re-run and fix it...")
            print("Another possibility is you were missing a / (with 1 character of whitespace on each side surrounding it) as a seperator.")
            # FIX: previously execution fell through here and continued with a
            # stale (or, on the first pair, undefined) 'thehash', renaming the
            # wrong entry instead of aborting as the message promises.
            return
        if bytes(thehash, 'utf-8') in reverselookup:
            key = reverselookup[bytes(thehash, 'utf-8')][0]
            torrentlist[key][b"caption"] = bytes(after[after.rfind("\\") + 1:], 'utf-8')
            try:
                # Progress numbers correspond to the (every-two-lines) numbers
                # in the plan file. Filenames printed to console lose unicode
                # chars because the Windows console is not unicode compatible.
                print(i, before.encode('ascii', errors='ignore').decode())
                print(i + 1, after.encode('ascii', errors='ignore').decode())
                os.rename(before, after)
            except Exception:
                traceback.print_exc()  # log any error to console but keep going
            torrentlist[key][b"path"] = bytes(after, 'utf-8')
            # Single-file torrents (.mp3/.flac here) also carry a "targets"
            # list that controls the filename; changing "path" alone is not
            # enough to make uTorrent pick up the rename.
            if after.endswith(".mp3") or after.endswith(".flac"):
                if b"targets" in torrentlist[key]:
                    torrentlist[key][b"targets"][0][1] = torrentlist[key][b"caption"]
    # Put "rec" back so we don't break anything; ".fileguard" stays out.
    if rec is not None:  # FIX: don't insert a None value bencode can't encode
        torrentlist[b"rec"] = rec
    newfile.write(bencode.bencode(torrentlist))
    newfile.close()
    print("\nPlease note that the filenames shown are missing any unicode characters due to Windows Command Prompt limitations.")
    print("Finished writing: ", newfile.name)
def _calc_terminal_scores(self, w):
    """ Calculate the score for each possible terminal/token match """
    # First pass: for each token, find the possible terminals that
    # can correspond to that token
    finals = defaultdict(set)
    tokens = dict()
    self._find_options(w, finals, tokens)
    # Second pass: find a (partial) ordering by scoring the terminal
    # alternatives for each token
    scores = dict()
    noun_prefs = NounPreferences.DICT
    # Loop through the indices of the tokens spanned by this tree
    for i in range(w.start, w.end):
        s = finals[i]
        # Initially, each alternative has a score of 0
        scores[i] = {terminal: 0 for terminal in s}
        if len(s) <= 1:
            # No ambiguity to resolve here
            continue
        # More than one terminal in the option set for the token at index i
        # Calculate the relative scores
        # Find out whether the first part of all the terminals are the same
        same_first = len(set(terminal.first for terminal in s)) == 1
        txt = tokens[i].lower
        # Get the last part of a composite word (e.g. 'jaðar-áhrifin' -> 'áhrifin')
        txt_last = txt.rsplit('-', maxsplit=1)[-1]
        # No need to check preferences if the first parts of all possible
        # terminals are equal.
        # Look up the preference ordering from Reynir.conf, if any
        prefs = None if same_first else Preferences.get(txt_last)
        sc = scores[i]
        if prefs:
            # Collect the extreme (most negative / most positive) adjustment
            # per terminal, then apply each once.
            adj_worse = defaultdict(int)
            adj_better = defaultdict(int)
            for worse, better, factor in prefs:
                for wt in s:
                    if wt.first in worse:
                        for bt in s:
                            if wt is not bt and bt.first in better:
                                if bt.name[0] in "\"'":
                                    # Literal terminal:
                                    # be even more aggressive in promoting it
                                    adj_w = -2 * factor
                                    adj_b = +6 * factor
                                else:
                                    adj_w = -2 * factor
                                    adj_b = +4 * factor
                                adj_worse[wt] = min(adj_worse[wt], adj_w)
                                adj_better[bt] = max(adj_better[bt], adj_b)
            for wt, adj in adj_worse.items():
                sc[wt] += adj
            for bt, adj in adj_better.items():
                sc[bt] += adj
        # Apply heuristics to each terminal that potentially matches this token
        for t in s:
            tfirst = t.first
            if tfirst == "ao" or tfirst == "eo":
                # Subtract from the score of all ao and eo
                sc[t] -= 1
            elif tfirst == "no":
                if t.is_singular:
                    # Add to singular nouns relative to plural ones
                    sc[t] += 1
                elif t.is_abbrev:
                    # Punish abbreviations in favor of other more specific terminals
                    sc[t] -= 1
                if tokens[i].is_upper and tokens[i].is_word and tokens[i].t2:
                    # Punish connection of normal noun terminal to
                    # an uppercase word that can be a person or entity name
                    if any(m.fl in {"ism", "föð", "móð", "örn", "fyr"}
                            for m in tokens[i].t2):
                        sc[t] -= 5
                # Noun priorities, i.e. between different genders
                # of the same word form
                # (for example "ára" which can refer to three stems
                # with different genders)
                if txt_last in noun_prefs:
                    np = noun_prefs[txt_last].get(t.gender, 0)
                    sc[t] += np
            elif tfirst == "fs":
                if t.has_variant("nf"):
                    # Reduce the weight of the 'artificial' nominative
                    # prepositions 'næstum', 'sem', 'um'
                    sc[t] -= 8  # Make other cases clearly outweigh the Nl_nf bonus of +4
                elif txt == "við" and t.has_variant("þgf"):
                    sc[t] += 1  # Smaller bonus for við + þgf (is rarer than við + þf)
                elif txt == "sem" and t.has_variant("þf"):
                    sc[t] -= 4
                elif txt == "á" and t.has_variant("þgf"):
                    # Larger bonus for á + þgf to resolve conflict with verb 'eiga'
                    sc[t] += 4
                else:
                    # Else, give a bonus for each matched preposition
                    sc[t] += 2
            elif tfirst == "so":
                if t.num_variants > 0 and t.variant(0) in "012":
                    # Consider verb arguments
                    # Normally, we give a bonus for verb arguments:
                    # the more matched, the better
                    numcases = int(t.variant(0))
                    adj = 2 * numcases
                    # !!! Logic should be added here to encourage zero
                    # arguments for verbs in 'miðmynd'
                    if numcases == 0:
                        # Zero arguments: we might not like this
                        vo0 = VerbObjects.VERBS[0]
                        if all(
                                (m.stofn not in vo0) and (m.ordmynd not in vo0)
                                and ("MM" not in m.beyging)
                                for m in tokens[i].t2 if m.ordfl == "so"):
                            # No meaning where the verb has zero arguments
                            adj = -5
                    # Apply score adjustments for verbs with particular object cases,
                    # as specified by $score(n) pragmas in Verbs.conf
                    # In the (rare) cases where there are conflicting scores,
                    # apply the most positive adjustment
                    adjmax = 0
                    for m in tokens[i].t2:
                        if m.ordfl == "so":
                            key = m.stofn + t.verb_cases
                            score = VerbObjects.SCORES.get(key)
                            if score is not None:
                                adjmax = score
                                break
                    sc[t] += adj + adjmax
                if t.is_sagnb:
                    # We like sagnb and lh, it means that more
                    # than one piece clicks into place
                    sc[t] += 6
                elif t.is_lh:
                    # sagnb is preferred to lh, but vb (veik beyging) is discouraged
                    if t.has_variant("vb"):
                        sc[t] -= 2
                    else:
                        sc[t] += 3
                elif t.is_lh_nt:
                    sc[t] += 12  # Encourage LHNT rather than LO
                elif t.is_mm:
                    # Encourage mm forms. The encouragement should be better than
                    # the score for matching a single case, so we pick so_0_mm
                    # rather than so_1_þgf, for instance.
                    sc[t] += 3
                elif t.is_vh:
                    # Encourage vh forms
                    sc[t] += 2
                if t.is_subj:
                    # Give a small bonus for subject matches
                    if t.has_variant("none"):
                        # ...but a punishment for subj_none
                        sc[t] -= 3
                    else:
                        sc[t] += 1
                if t.is_nh:
                    if (i > 0) and any(pt.first == 'nhm' for pt in finals[i - 1]):
                        # Give a bonus for adjacent nhm + so_nh terminals
                        sc[t] += 4  # Prop up the verb terminal with the nh variant
                        for pt in scores[i - 1].keys():
                            if pt.first == 'nhm':
                                # Prop up the nhm terminal
                                scores[i - 1][pt] += 2
                                break
                    if any(pt.first == "no" and pt.has_variant("ef") and pt.is_plural
                            for pt in s):
                        # If this is a so_nh and an alternative no_ef_ft exists,
                        # choose this one (for example, 'hafa', 'vera', 'gera',
                        # 'fara', 'mynda', 'berja', 'borða')
                        sc[t] += 4
                if (i > 0) and tokens[i].is_upper:
                    # The token is uppercase and not at the start of a sentence:
                    # discourage it from being a verb
                    sc[t] -= 4
            elif tfirst == "tala":
                if t.has_variant("ef"):
                    # Try to avoid interpreting plain numbers as possessives
                    sc[t] -= 4
            elif tfirst == "person":
                if t.has_variant("nf"):
                    # Prefer person names in the nominative case
                    sc[t] += 2
            elif tfirst == "sérnafn":
                if not tokens[i].t2:
                    # If there are no BÍN meanings, we had no choice but to use
                    # sérnafn, so alleviate some of the penalty given by the grammar
                    sc[t] += 4
                else:
                    # BÍN meanings are available: discourage this
                    sc[t] -= 10
                    if i == w.start:
                        # First token in sentence, and we have BÍN meanings:
                        # further discourage this
                        sc[t] -= 6
            elif tfirst == "fyrirtæki":
                # We encourage company names to be interpreted as such,
                # so we give company abbreviations ('hf.', 'Corp.', 'Limited')
                # a high priority
                sc[t] += 24
            elif tfirst == "st" or (tfirst == "sem" and t.colon_cat == "st"):
                if txt == "sem":
                    # Discourage "sem" as a pure conjunction (samtenging)
                    # (it does not get a penalty when occurring as
                    # a connective conjunction, 'stt')
                    sc[t] -= 6
            elif tfirst == "abfn":
                # If we have number and gender information with the reflexive
                # pronoun, that's good: encourage it
                sc[t] += 6 if t.num_variants > 1 else 2
            elif tfirst == "gr":
                # Encourage separate definite article rather than pronoun
                sc[t] += 2
            elif t.name[0] in "\"'":
                # Give a bonus for exact or semi-exact matches
                sc[t] += 1
    return scores
def main():
    """Batch-rename downloaded torrent files/folders (compact variant).

    Reads "<new name> / <hex infohash>" lines (outpath3), writes a reviewable
    before/after plan (outpath4), waits for user confirmation, then renames
    the paths on disk and patches uTorrent's resume data into NEWDAT.dat.
    """
    ss = Preferences()
    newfile = open(os.path.join(ss.get("maindir"), "NEWDAT.dat"), 'wb')
    # ("3propernames.txt")
    namesandhashfile = open(ss.getwpath("outpath3"), 'r', encoding='utf-8').readlines()
    # Intermediate changes to happen before actually renaming, so the user has
    # a chance to edit/change it. (4beforepath-afterpath.txt)
    beforeafterpath = ss.getwpath("outpath4")
    torrentlist = bencode.decode_from_file(ss.get("utresumedat"))
    # These two root entries interfere with the comprehension below; "rec" is
    # re-added before writing out, ".fileguard" purposely stays out.
    torrentlist.pop(b".fileguard", None)
    rec = torrentlist.pop(b"rec", None)
    # Reverse lookup: hex(info-hash) -> [original key, caption, path]
    reverselookup = {
        base64.b16encode(value[b"info"]): [key, value[b"caption"], value[b"path"]]
        for key, value in torrentlist.items()
    }
    listofbeforeafter = []
    for eachline in namesandhashfile:
        # Strip the \n and split on the " / " separator written earlier.
        nameandhash = eachline.strip().split(' / ')
        theNewname = nameandhash[0]
        thehash = nameandhash[1]
        if bytes(thehash, 'utf-8') in reverselookup:
            key = reverselookup[bytes(thehash, 'utf-8')][0]
            theOldPath = torrentlist[key][b"path"].decode('utf-8')
            theNewPath = os.path.join(os.path.dirname(theOldPath), theNewname)
            if theOldPath != theNewPath:
                listofbeforeafter.append([theOldPath, theNewPath, thehash])
    # Sort, then write the plan: old path, then new path, each followed by the
    # hash (duplicated for error checking against accidental edits).
    listofbeforeafter.sort()
    with open(beforeafterpath, 'w', encoding='utf-8') as beforeafterfile:
        for eachline in listofbeforeafter:
            beforeafterfile.write(eachline[0] + " / " + eachline[2] + "\n")
            beforeafterfile.write(eachline[1] + " / " + eachline[2] + "\n")
    # Pause so the user can review/edit the plan before anything is renamed.
    input("Press Enter to begin Renaming files.......\\> ")
    # WRITE TORRENT RESUME.DAT
    beforeafterfile = open(beforeafterpath, 'r', encoding='utf-8').readlines()
    for i in range(0, len(beforeafterfile), 2):
        beforeandhash = beforeafterfile[i].strip().split(' / ')
        afterandhash = beforeafterfile[i + 1].strip().split(' / ')
        before = beforeandhash[0]
        beforehash = beforeandhash[1]
        after = afterandhash[0]
        afterhash = afterandhash[1]
        if beforehash == afterhash:
            thehash = beforehash
        else:
            print("Error. You have inadvertently modified one of the hash files, and there is a hash mismatch between before/after entries.")
            print("Cannot continue. Exiting. Please save your changes into a new file, locate your error, and re-run and fix it...")
            print("Another possibility is you were missing a / (with 1 character of whitespace on each side surrounding it) as a seperator.")
            # FIX: previously execution fell through here and continued with a
            # stale (or, on the first pair, undefined) 'thehash'. Abort as the
            # message promises.
            return
        if bytes(thehash, 'utf-8') in reverselookup:
            key = reverselookup[bytes(thehash, 'utf-8')][0]
            torrentlist[key][b"caption"] = bytes(after[after.rfind("\\") + 1:], 'utf-8')
            try:
                # Progress numbers correspond to the (every-two-lines) numbers
                # in the plan file; console output loses unicode chars because
                # the Windows console is not unicode compatible.
                print(i, before.encode('ascii', errors='ignore').decode())
                print(i + 1, after.encode('ascii', errors='ignore').decode())
                os.rename(before, after)
            except Exception:
                traceback.print_exc()  # log any error to console but keep going
            torrentlist[key][b"path"] = bytes(after, 'utf-8')
            # Single-file torrents (.mp3/.flac here) also carry a "targets"
            # list that controls the filename; changing "path" alone is not
            # enough to make uTorrent pick up the rename.
            if after.endswith(".mp3") or after.endswith(".flac"):
                if b"targets" in torrentlist[key]:
                    torrentlist[key][b"targets"][0][1] = torrentlist[key][b"caption"]
    # Put "rec" back so we don't break anything; ".fileguard" stays out.
    if rec is not None:  # FIX: don't insert a None value bencode can't encode
        torrentlist[b"rec"] = rec
    newfile.write(bencode.bencode(torrentlist))
    newfile.close()
    print("\nPlease note that the filenames shown are missing any unicode characters due to Windows Command Prompt limitations.")
    print("Finished writing: ", newfile.name)
def main(): ss = Preferences() oldfile = open(ss.get("utresumedat"), "rb").read() newfile = open(os.path.join(ss.get("maindir"), u"NEWDAT.dat"), "wb") namesandhashfile = open(ss.getwpath("outpath3"), "rb").readlines() beforeafterpath = ss.getwpath( "outpath4" ) # this holds the intermediate changes to happen before actually renaming so you have a chance to edit/change it. (4beforepath-afterpath.txt) torrentlist = bencode.bdecode(oldfile) # These two things interfere with the processing on the next line fileguarduseless = torrentlist.pop(".fileguard", None) rec = torrentlist.pop("rec", None) # Remove this. # (dict. comprehension expects only dicts as the root keys) # create a reverse lookup dict with "Dict comprehension". nice and simple eh? ;-) reverselookup = { base64.b16encode(value["info"]): [key, value["caption"], value["path"]] for key, value in torrentlist.iteritems() } listofbeforeafter = [] # to modify paths in reverse lookup dict, start by getting the names and hash out of the namesandhashfile for eachline in namesandhashfile: nameandhash = eachline.strip().split( " / " ) # strip out the \n with strip() and split on the " / " i put there as a seperator. theNewname = nameandhash[0] thehash = nameandhash[1] # searches the dict's keys for a Hash, if exists. 
and if so, can be used as the [indexid] if thehash in reverselookup: key = reverselookup[thehash][0] theOldPath = torrentlist[key]["path"] theNewPath = os.path.join(os.path.dirname(theOldPath), theNewname) if theOldPath != theNewPath: listofbeforeafter.append( [theOldPath, theNewPath, thehash] ) # make a list of a list (stringtoOutputtoFile=[0], hash=[1]) # sort, then write file detailing changes to path (before / after) listofbeforeafter.sort() beforeafterfile = open(beforeafterpath, "wb") for eachline in listofbeforeafter: try: beforeafterfile.write( eachline[0] + " / " + eachline[2] + "\n" ) # write oldpath + hash on 1st line /The hash is duplicated for error checking in case the user accidentally bungles a character while editing... beforeafterfile.write(eachline[1] + " / " + eachline[2] + "\n") # write newpath + hash on 2nd line / except: print "Error writing the before+after file, probably a encoding/unicode error: \n", eachline[ 0 ], "\n", eachline[1] print "This was a fatal error and program could not continue." return beforeafterfile.close() # At this point the script pauses, and asks the user to confirm changes shown in the beforepath-afterpath.txt file raw_input( "Press Enter to begin Renaming files.......\\> " ) # wait for the user to press Enter before continuing with anything. # WRITE TORRENT RESUME.DAT beforeafterfile = open(beforeafterpath, "rb").readlines() for i in xrange(0, len(beforeafterfile), 2): beforeandhash = beforeafterfile[i].strip().split(" / ") afterandhash = beforeafterfile[i + 1].strip().split(" / ") before = beforeandhash[0].decode("utf-8") beforehash = beforeandhash[1] after = afterandhash[0].decode("utf-8") afterhash = afterandhash[1] if beforehash == afterhash: thehash = beforehash else: print "Error. You have inadvertently modified one of the hash files, and there is a hash mismatch between before/after entries." print "Cannot continue. Exiting. 
Please save your changes into a new file, locate your error, and re-run and fix it..." print "Another possibility is you were missing a / (with 1 character of whitespace on each side surrounding it) as a seperator." # searches the dict's keys for a Hash, if exists. and if so, can be used as the [indexid] if thehash in reverselookup: key = reverselookup[thehash][0] torrentlist[key]["caption"] = after[after.rfind("\\") + 1 :] try: # prints a number to console to show progress. corresponds to the numbers in the file (every-two-lines). (tip:) to show incremental numbers use (((i+1)/2)+1) # filenames printed to console, will be missing any unicode chars because the windows console is not unicode compatible!!!! (annoying) print i, before.encode("ascii", errors="ignore") print i + 1, after.encode("ascii", errors="ignore") os.rename(before, after) except Exception as e: traceback.print_exc() # will output any errors to console but keep going torrentlist[key]["path"] = after if after.endswith(".mp3") or after.endswith( ".flac" ): # .mp3 .flac = I personally didnt have any "Single file" .ogg, .aac, etc that needed special handling in this manner if torrentlist[key].has_key( "targets" ): # these lines are a quick fix, for an oversight in the uTorrent process. changing path is not enough torrentlist[key]["targets"][0][1] = after[ after.rfind("\\") + 1 : ] # single-file-mode torrents have a "targets" list that controls the filename torrentlist["rec"] = rec # add the thing we removed back in so we dont break anything (not sure what this is) # fileguard does not need to go back, in fact, purposefully needs to stay out. newfile.write(bencode.bencode(torrentlist)) newfile.close() print "Finished writing: ", newfile.name