Esempio n. 1
0
def main():
    """Dump every torrent entry from uTorrent's resume.dat to TorrentList.txt.

    Reads the bencoded resume.dat (path taken from Preferences), builds a
    reverse-lookup dict keyed by the upper-case hex info-hash, then writes
    one sorted "path / caption / infohash" line per torrent entry.
    """
    ss = Preferences()

    torrentlist = bencode.decode_from_file(ss.get("utresumedat"))

    # These two top-level keys are not torrent entries and would break the
    # dict comprehension below, which expects every value to be a dict.
    torrentlist.pop(b".fileguard", None)
    torrentlist.pop(b"rec", None)

    # Reverse lookup: hex(info-hash) -> [path, caption, original key].
    reverselookup = {
        base64.b16encode(value[b"info"]): [value[b"path"], value[b"caption"], origkey]
        for origkey, value in torrentlist.items()
    }

    # Decode the bytes values once, up front, into [path, caption, infohash].
    partiallist = [
        [value[0].decode('utf-8'), value[1].decode('utf-8'), thehash.decode('utf-8')]
        for thehash, value in reverselookup.items()
    ]
    partiallist.sort()

    # Write one entry per line: path / caption / infohash.
    # 'with' guarantees the file is closed even if a write raises.
    outpath = os.path.join(ss.get("maindir"), "TorrentList.txt")
    with open(outpath, 'w', encoding='utf-8') as writelistfile:
        for eachline in partiallist:
            writelistfile.write(eachline[0] + " / " + eachline[1] + " / " + eachline[2] + "\n")
Esempio n. 2
0
def main():
    """Dump every torrent entry from uTorrent's resume.dat to TorrentList.txt.

    Python 2 variant of the exporter: uses dict.iteritems(), str keys
    (not bytes), binary file mode and the print statement.
    """

    ss = Preferences()

    torrentlist = bencode.decode_from_file(ss.get("utresumedat"))
    partiallist = [
    ]  # set up an empty container for desired data to get put into for later

    # These two top-level keys are not torrent entries; remove them so the
    # dict comprehension below only sees per-torrent dicts.
    fileguarduseless = torrentlist.pop(".fileguard", None)
    rec = torrentlist.pop("rec", None)  #Remove this.
    #(dict. comprehension expects only dicts as the root keys)
    #create a reverse lookup dict with "Dict comprehension". nice and simple eh? ;-)
    # Maps hex(info-hash) -> [path, caption, original dict key].
    reverselookup = {
        base64.b16encode(value["info"]):
        [value["path"], value["caption"], origkey]
        for origkey, value in torrentlist.iteritems()
    }
    for thehash, value in reverselookup.iteritems():
        partiallist.append([value[0], value[1], thehash])

    partiallist.sort()
    writelistfile = open(
        os.path.join(ss.get("maindir"), "TorrentList.txt"),
        'wb')  # write-out a text file with one entry per line.
    for eachline in partiallist:
        writelistfile.write(eachline[0] + " / " + eachline[1] + " / " +
                            eachline[2] + "\n")
        #path           /   #caption          /     #infohash
    writelistfile.close()
    print "Finished writing: TorrentList.txt"
def main():
    """Sort .torrent files in <maindir>/All-Torrs/ into per-tracker subfolders.

    For each .torrent file the tracker name is taken from the "announce"
    URL's host (port stripped); files carrying an "announce-list" go into
    "Multiple Trackers", and files that fail to decode go into "None".
    (Python 2 code: relies on dict.iteritems().)
    """
    ss = Preferences()  # settings.1py

    directory_path = os.path.join(
        ss.get("maindir"), u"All-Torrs\\"
    )  # needs a unicode symbol so os. commands work at all on paths with funny chars

    files = [
        os.path.join(directory_path, fn) for fn in next(os.walk(directory_path))[2]
    ]  # gives absolute paths + names

    for eachfile in files:
        # Default destination. The original only assigned `tracker` inside
        # the try/except, so a torrent with neither "announce" nor
        # "announce-list" raised NameError (or silently reused the previous
        # file's tracker). Initializing here fixes both cases.
        tracker = "None"
        with open(eachfile, "rb") as stringfile:
            try:
                torrent = bencode.decode(stringfile.read())
                for key, value in torrent.iteritems():
                    if key == "announce":
                        announce = value
                        domain = "{uri.netloc}".format(uri=urlparse(announce))
                        colon = domain.find(":", 0)
                        if colon != -1:
                            domain = domain[:colon]  # strip the :port suffix
                        if domain:
                            tracker = domain  # only using 1 value here(lazy)
                    elif key == "announce-list":
                        tracker = "Multiple Trackers"
            except Exception:  # bencode decode failed: file is filed under "None"
                tracker = "None"
        torrentfilename = eachfile[eachfile.rfind("\\") + 1:]

        if not os.path.exists(directory_path + tracker):
            os.makedirs(directory_path + tracker)
        os.rename(eachfile, os.path.join(directory_path + tracker + "\\" + torrentfilename))
def main():
    """Dump every torrent entry from uTorrent's resume.dat to TorrentList.txt.

    Python 2 variant: uses dict.iteritems(), str keys, binary file mode
    and the print statement.
    """

    ss = Preferences()

    torrentlist = bencode.decode_from_file(ss.get("utresumedat"))
    partiallist = []    # set up an empty container for desired data to get put into for later

    # These two top-level keys are not torrent entries; remove them so the
    # dict comprehension below only sees per-torrent dicts.
    fileguarduseless = torrentlist.pop(".fileguard",None)
    rec = torrentlist.pop("rec",None)   #Remove this. 
    #(dict. comprehension expects only dicts as the root keys)
    #create a reverse lookup dict with "Dict comprehension". nice and simple eh? ;-)
    # Maps hex(info-hash) -> [path, caption, original dict key].
    reverselookup={base64.b16encode(value["info"]):[value["path"],value["caption"],origkey] for origkey,value in torrentlist.iteritems()}
    for thehash,value in reverselookup.iteritems():
        partiallist.append([value[0],value[1],thehash])

    partiallist.sort()
    writelistfile = open(os.path.join(ss.get("maindir"),"TorrentList.txt"),'wb') # write-out a text file with one entry per line.
    for eachline in partiallist:
        writelistfile.write(eachline[0] + " / " + eachline[1] + " / " + eachline[2] + "\n")
                            #path           /   #caption          /     #infohash
    writelistfile.close()
    print "Finished writing: TorrentList.txt"
def main():
    """Sort .torrent files in <maindir>/All-Torrs/ into per-tracker subfolders.

    Tracker name comes from the "announce" URL host (port stripped);
    "announce-list" torrents go to "Multiple Trackers", undecodable files
    to "None". (Python 2 code: relies on dict.iteritems().)
    """
    ss = Preferences()  #settings.1py

    directory_path = os.path.join(
        ss.get("maindir"), u"All-Torrs\\"
    )  #needs a unicode symbol so os. commands work at all on paths with funny chars

    files = [
        os.path.join(directory_path, fn)
        for fn in next(os.walk(directory_path))[2]
    ]  #gives absolute paths + names

    for eachfile in files:
        # Bug fix: the original assigned `tracker` only inside try/except,
        # so a torrent with neither "announce" nor "announce-list" caused a
        # NameError on the first file (or reused the previous file's value).
        tracker = "None"
        with open(eachfile, 'rb') as stringfile:
            try:
                torrent = bencode.decode(stringfile.read())
                for key, value in torrent.iteritems():
                    if key == "announce":
                        announce = value
                        domain = '{uri.netloc}'.format(uri=urlparse(announce))
                        colon = domain.find(':', 0)
                        if colon != -1:
                            domain = domain[:colon]  # drop the :port suffix
                        if domain:
                            tracker = domain  #only using 1 value here(lazy)
                    elif key == "announce-list":
                        tracker = "Multiple Trackers"
            except Exception:  # decode failure: file is filed under "None"
                tracker = "None"
        torrentfilename = eachfile[eachfile.rfind("\\") + 1:]

        if not os.path.exists(directory_path + tracker):
            os.makedirs(directory_path + tracker)
        os.rename(
            eachfile,
            os.path.join(directory_path + tracker + "\\" + torrentfilename))
Esempio n. 6
0
    def _calc_terminal_scores(self, w):
        """ Calculate the score for each possible terminal/token match.

        Returns a dict mapping each token index in [w.start, w.end) to a
        {terminal: score} dict. Scores start at 0 and are adjusted first by
        preference pairs from Reynir.conf and then by hard-coded
        part-of-speech heuristics.
        """

        # First pass: for each token, find the possible terminals that
        # can correspond to that token
        finals = defaultdict(set)
        tokens = dict()
        self._find_options(w, finals, tokens)

        # Second pass: find a (partial) ordering by scoring the terminal alternatives for each token
        scores = dict()

        # Loop through the indices of the tokens spanned by this tree
        for i in range(w.start, w.end):

            s = finals[i]
            # Initially, each alternative has a score of 0
            scores[i] = {terminal: 0 for terminal in s}

            #print("Reducing token '{0}'; scores dict initialized to:\n{1}".format(tokens[i].t1, scores[i]))

            if len(s) <= 1:
                # No ambiguity to resolve here
                continue

            # More than one terminal in the option set for the token at index i
            # Calculate the relative scores
            # Find out whether the first part of all the terminals are the same
            same_first = len(set(terminal.first for terminal in s)) == 1
            txt = tokens[i].lower
            # No need to check preferences if the first parts of all possible terminals are equal
            # Look up the preference ordering from Reynir.conf, if any
            prefs = None if same_first else Preferences.get(txt)
            found_pref = False
            sc = scores[i]
            if prefs:
                # Accumulate the extreme (most negative / most positive)
                # adjustment per terminal, then apply each once, so that
                # overlapping preference pairs don't stack.
                adj_worse = defaultdict(int)
                adj_better = defaultdict(int)
                for worse, better, factor in prefs:
                    for wt in s:
                        if wt.first in worse:
                            for bt in s:
                                if wt is not bt and bt.first in better:
                                    if bt.name[0] in "\"'":
                                        # Literal terminal: be even more aggressive in promoting it
                                        adj_w = -2 * factor
                                        adj_b = +6 * factor
                                    else:
                                        adj_w = -2 * factor
                                        adj_b = +4 * factor
                                    adj_worse[wt] = min(adj_worse[wt], adj_w)
                                    adj_better[bt] = max(adj_better[bt], adj_b)
                                    found_pref = True
                for wt, adj in adj_worse.items():
                    #print("Token '{2}': Adjusting score of terminal '{0}' by {1}".format(wt, adj, txt))
                    sc[wt] += adj
                for bt, adj in adj_better.items():
                    #print("Token '{2}': Adjusting score of terminal '{0}' by {1}".format(bt, adj, txt))
                    sc[bt] += adj
            #if not same_first and not found_pref:
            #    # Only display cases where there might be a missing pref
            #    print("Token '{0}' has {1} possible terminal matches: {2}".format(txt, len(s), s))

            # Apply heuristics to each terminal that potentially matches this token
            for t in s:
                tfirst = t.first
                if tfirst == "ao" or tfirst == "eo":
                    # Subtract from the score of all ao and eo
                    sc[t] -= 1
                elif tfirst == "no":
                    if t.is_singular:
                        # Add to singular nouns relative to plural ones
                        sc[t] += 1
                    elif t.is_abbrev:
                        # Punish abbreviations in favor of other more specific terminals
                        sc[t] -= 1
                elif tfirst == "fs":
                    if t.has_variant("nf"):
                        # Reduce the weight of the 'artificial' nominative prepositions
                        # 'næstum', 'sem', 'um'
                        sc[t] -= 5  # Make other cases outweigh the Nl_nf bonus of +4 (-2 -3 = -5)
                    elif txt == "við" and t.has_variant("þgf"):
                        sc[t] += 1  # Smaller bonus for við + þgf (is rarer than við + þf)
                    elif txt == "sem" and t.has_variant("þf"):
                        sc[t] -= 6  # Even less attractive than sem_nf
                    else:
                        # Else, give a bonus for each matched preposition
                        sc[t] += 2
                elif tfirst == "so":
                    if t.variant(0) in "012":
                        # Consider verb arguments
                        # Normally, we give a bonus for verb arguments: the more matched, the better
                        numcases = int(t.variant(0))
                        adj = 2 * numcases
                        # !!! Logic should be added here to encourage zero arguments for verbs in 'miðmynd'
                        if numcases == 0:
                            # Zero arguments: we might not like this
                            if all((m.stofn not in VerbObjects.VERBS[0]) and (
                                    "MM" not in m.beyging)
                                   for m in tokens[i].t2 if m.ordfl == "so"):
                                # No meaning where the verb has zero arguments
                                adj = -5
                        # Apply score adjustments for verbs with particular object cases,
                        # as specified by $score(n) pragmas in Verbs.conf
                        # In the (rare) cases where there are conflicting scores,
                        # apply the most positive adjustment
                        adjmax = 0
                        for m in tokens[i].t2:
                            if m.ordfl == "so":
                                key = m.stofn + t.verb_cases
                                score = VerbObjects.SCORES.get(key)
                                if score is not None:
                                    adjmax = score
                                    break
                        sc[t] += adj + adjmax
                    if t.is_sagnb:
                        # We like sagnb and lh, it means that more
                        # than one piece clicks into place
                        sc[t] += 6
                    elif t.is_lh:
                        # sagnb is preferred to lh, but vb (veik beyging) is discouraged
                        if t.has_variant("vb"):
                            sc[t] -= 2
                        else:
                            sc[t] += 3
                    elif t.is_mm:
                        # Encourage mm forms. The encouragement should be better than
                        # the score for matching a single case, so we pick so_0_mm
                        # rather than so_1_þgf, for instance.
                        sc[t] += 3
                    elif t.is_vh:
                        # Encourage vh forms
                        sc[t] += 2
                    if t.is_subj:
                        # Give a small bonus for subject matches
                        if t.has_variant("none"):
                            # ... but a punishment for subj_none
                            sc[t] -= 3
                        else:
                            sc[t] += 1
                    if t.is_nh:
                        if (i > 0) and any(pt.first == 'nhm'
                                           for pt in finals[i - 1]):
                            # Give a bonus for adjacent nhm + so_nh terminals
                            sc[t] += 4  # Prop up the verb terminal with the nh variant
                            for pt in scores[i - 1].keys():
                                if pt.first == 'nhm':
                                    # Prop up the nhm terminal
                                    scores[i - 1][pt] += 2
                                    # print("Propping up nhm for verb {1}, score is now {0}".format(scores[i-1][pt], tokens[i].t1))
                                    break
                        if any(pt.first == "no" and pt.has_variant("ef")
                               and pt.is_plural for pt in s):
                            # If this is a so_nh and an alternative no_ef_ft exists, choose this one
                            # (for example, 'hafa', 'vera', 'gera', 'fara', 'mynda', 'berja', 'borða')
                            sc[t] += 4
                elif tfirst == "tala" or tfirst == "töl":
                    # A complete 'töl' or 'no' is better (has more info) than a rough 'tala'
                    if tfirst == "tala":
                        sc[t] -= 1
                    # Discourage possessive ('ef') meanings for numbers
                    for pt in s:
                        if (pt.first == "no"
                                or pt.first == "töl") and pt.has_variant("ef"):
                            sc[pt] -= 1
                elif tfirst == "sérnafn":
                    if not tokens[i].t2:
                        # If there are no BÍN meanings, we had no choice but to use sérnafn,
                        # so alleviate some of the penalty given by the grammar
                        sc[t] += 2
                    else:
                        # BÍN meanings are available: discourage this
                        #print("sérnafn '{0}': BÍN meanings available, discouraging".format(tokens[i].t1))
                        sc[t] -= 6
                        if i == w.start:
                            # First token in sentence, and we have BÍN meanings:
                            # further discourage this
                            sc[t] -= 4
                        #print("Meanings for sérnafn {0}:".format(tokens[i].t1))
                        #for m in tokens[i].t2:
                        #    print("{0}".format(m))
                    #        if m.stofn[0].isupper():
                    #            sc[t] -= 4 # Discourage 'sérnafn' if an uppercase BÍN meaning is available
                    #            break
                elif t.name[0] in "\"'":
                    # Give a bonus for exact or semi-exact matches
                    sc[t] += 1

        #for i in range(w.start, w.end):
        #    print("At token '{0}' scores dict is:\n{1}".format(tokens[i].t1, scores[i]))
        return scores
Esempio n. 7
0
    def go_with_score(self, forest):

        """ Returns the argument forest after pruning it down to a single tree.

        Scores the terminal alternatives for each token (preference pairs
        from Reynir.conf plus part-of-speech heuristics), then reduces the
        forest bottom-up, keeping the highest-rated subtrees.
        Returns a (tree, score) tuple; (None, 0) if forest is None.
        """

        if forest is None:
            return (None, 0)
        w = forest

        # First pass: for each token, find the possible terminals that
        # can correspond to that token
        finals = defaultdict(set)
        tokens = dict()
        self._find_options(w, finals, tokens)

        # Second pass: find a (partial) ordering by scoring the terminal alternatives for each token
        scores = dict()

        # Loop through the indices of the tokens spanned by this tree
        for i in range(w.start, w.end):

            s = finals[i]
            # Initially, each alternative has a score of 0
            scores[i] = { terminal: 0 for terminal in s }
            if len(s) > 1:
                # More than one terminal in the option set
                # Calculate the relative scores
                # Find out whether the first part of all the terminals are the same
                same_first = len(set(x.first for x in s)) == 1
                txt = tokens[i].lower
                # No need to check preferences if the first parts of all possible terminals are equal
                # Look up the preference ordering from Reynir.conf, if any
                prefs = None if same_first else Preferences.get(txt)
                found_pref = False
                sc = scores[i]
                if prefs:
                    # Demote the 'worse' terminal and promote the 'better' one
                    # for every matching preference pair
                    for worse, better, factor in prefs:
                        for wt in s:
                            if wt.first in worse:
                                for bt in s:
                                    if wt is not bt and bt.first in better:
                                        if bt.name[0] in "\"'":
                                            # Literal terminal: be even more aggressive in promoting it
                                            sc[wt] -= 2 * factor
                                            sc[bt] += 6 * factor
                                        else:
                                            sc[wt] -= 2 * factor
                                            sc[bt] += 4 * factor
                                        found_pref = True
                #if not same_first and not found_pref:
                #    # Only display cases where there might be a missing pref
                #    print("Token '{0}' has {1} possible terminal matches: {2}".format(txt, len(s), s))

                # Apply heuristics to each terminal that potentially matches this token
                for t in s:
                    tfirst = t.first
                    if tfirst == "ao" or tfirst == "eo":
                        # Subtract from the score of all ao and eo
                        sc[t] -= 1
                    elif tfirst == "no":
                        if t.is_singular:
                            # Add to singular nouns relative to plural ones
                            sc[t] += 1
                        elif t.is_abbrev:
                            # Punish abbreviations in favor of other more specific terminals
                            sc[t] -= 1
                    elif tfirst == "fs":
                        if t.has_variant("nf"):
                            # Reduce the weight of the 'artificial' nominative prepositions
                            # 'næstum', 'sem', 'um'
                            sc[t] -= 3 # Make other cases outweigh the Nl_nf bonus of +4 (-2 -3 = -5)
                        else:
                            # Else, give a bonus for each matched preposition
                            sc[t] += 2
                    elif tfirst == "so":
                        if t.variant(0) in "012":
                            # Consider verb arguments
                            # Normally, we give a bonus for verb arguments: the more matched, the better
                            adj = 2 * int(t.variant(0))
                            # !!! Logic should be added here to encourage zero arguments for verbs in 'miðmynd'
                            if adj == 0:
                                # Zero arguments: we might not like this
                                for m in tokens[i].t2:
                                    if m.ordfl == "so" and m.stofn not in VerbObjects.VERBS[0]:
                                        # We're using a verb with zero arguments but that form is not
                                        # explicitly listed in Verbs.conf: discourage this
                                        # print("Discouraging zero-arg use of verb '{0}' (stem '{1}')".format(txt, m.stofn))
                                        adj = -1
                                        break
                            sc[t] += adj
                        if t.is_sagnb:
                            # We like sagnb and lh, it means that more
                            # than one piece clicks into place
                            sc[t] += 4
                        elif t.is_lh:
                            # sagnb is preferred to lh, but vb (veik beyging) is discouraged
                            if t.has_variant("vb"):
                                sc[t] -= 2
                            else:
                                sc[t] += 3
                        if t.is_subj:
                            # Give a small bonus for subject matches
                            if t.has_variant("none"):
                                # ... but a punishment for subj_none
                                sc[t] -= 3
                            else:
                                sc[t] += 1
                        if t.is_nh:
                            if (i > 0) and any(pt.first == 'nhm' for pt in finals[i - 1]):
                                # Give a bonus for adjacent nhm + so_nh terminals
                                sc[t] += 2 # Prop up the verb terminal with the nh variant
                                for pt in scores[i - 1].keys():
                                    if pt.first == 'nhm':
                                        # Prop up the nhm terminal
                                        scores[i - 1][pt] += 2
                            if any(pt.first == "no" and pt.has_variant("ef") and pt.is_plural for pt in s):
                                # If this is a so_nh and an alternative no_ef_ft exists, choose this one
                                # (for example, 'hafa', 'vera', 'gera', 'fara', 'mynda', 'berja', 'borða')
                                sc[t] += 2
                    elif tfirst == "tala" or tfirst == "töl":
                        # A complete 'töl' or 'no' is better (has more info) than a rough 'tala'
                        if tfirst == "tala":
                            sc[t] -= 1
                        # Discourage possessive ('ef') meanings for numbers
                        for pt in s:
                            if (pt.first == "no" or pt.first == "töl") and pt.has_variant("ef"):
                                sc[pt] -= 1
                    elif tfirst == "sérnafn":
                        if tokens[i].t2:
                            sc[t] -= 20 # Base penalty is -20
                            for m in tokens[i].t2:
                                sc[t] -= 1 # Subtract one for each BÍN meaning available
                                if m.stofn[0].isupper():
                                    sc[t] -= 8 # Heavily discourage 'sérnafn' if an uppercase BÍN meaning is available
                    elif t.name[0] in "\"'":
                        # Give a bonus for exact or semi-exact matches
                        sc[t] += 1

        # Third pass: navigate the tree bottom-up, eliminating lower-rated
        # options (subtrees) in favor of higher rated ones

        score = self._reduce(w, scores)

        return (w, score)
def main():
    """Rename torrent content files on disk and rewrite uTorrent resume data.

    Workflow:
      1. Read "newname / hash" lines from outpath3 ("3propernames.txt").
      2. Write an editable before/after rename plan to outpath4
         ("4beforepath-afterpath.txt"), then pause so the user can review
         and edit it.
      3. Re-read the (possibly edited) plan, rename the files, patch the
         decoded resume data, and write it out as NEWDAT.dat.
    """
    ss = Preferences()

    with open(ss.getwpath("outpath3"), 'r', encoding='utf-8') as infile:
        namesandhashfile = infile.readlines()  # ("3propernames.txt")

    # Path of the editable intermediate plan file (4beforepath-afterpath.txt).
    beforeafterpath = ss.getwpath("outpath4")

    torrentlist = bencode.decode_from_file(ss.get("utresumedat"))

    # These two root entries are not per-torrent dicts and would break the
    # dict comprehension below, which expects only dicts as root values.
    torrentlist.pop(b".fileguard", None)  # purposefully never restored
    rec = torrentlist.pop(b"rec", None)  # restored after processing, below

    # Reverse lookup: hex-encoded info hash -> [resume.dat key, caption, path]
    reverselookup = {
        base64.b16encode(value[b"info"]):
        [key, value[b"caption"], value[b"path"]]
        for key, value in torrentlist.items()
    }

    listofbeforeafter = []
    for eachline in namesandhashfile:
        # Each line is "<new name> / <hash>"; strip the newline and split on
        # the " / " separator written by the earlier step.
        nameandhash = eachline.strip().split(' / ')
        theNewname = nameandhash[0]
        thehash = nameandhash[1]
        hashkey = bytes(thehash, 'utf-8')
        if hashkey in reverselookup:
            key = reverselookup[hashkey][0]
            theOldPath = torrentlist[key][b"path"].decode('utf-8')
            theNewPath = os.path.join(os.path.dirname(theOldPath), theNewname)
            if theOldPath != theNewPath:
                # [oldpath, newpath, hash] -- the hash is duplicated on both
                # plan lines so user edits can be integrity-checked later.
                listofbeforeafter.append([theOldPath, theNewPath, thehash])

    # Sort, then write the plan file: oldpath+hash line, then newpath+hash.
    listofbeforeafter.sort()
    with open(beforeafterpath, 'w', encoding='utf-8') as beforeafterfile:
        for eachline in listofbeforeafter:
            beforeafterfile.write(eachline[0] + " / " + eachline[2] + "\n")
            beforeafterfile.write(eachline[1] + " / " + eachline[2] + "\n")

    # Pause here so the user can confirm (or edit) the plan before renaming.
    input("Press Enter to begin Renaming files.......\\> ")

    # WRITE TORRENT RESUME.DAT
    with open(beforeafterpath, 'r', encoding='utf-8') as planfile:
        beforeafterfile = planfile.readlines()
    for i in range(0, len(beforeafterfile), 2):
        beforeandhash = beforeafterfile[i].strip().split(' / ')
        afterandhash = beforeafterfile[i + 1].strip().split(' / ')
        before = beforeandhash[0]
        beforehash = beforeandhash[1]
        after = afterandhash[0]
        afterhash = afterandhash[1]
        if beforehash != afterhash:
            # BUGFIX: the original printed "Cannot continue. Exiting." but
            # then kept going with a stale (or, on the first pair, undefined)
            # hash. Actually stop here, as the message promises.
            print("Error. You have inadvertently modified one of the hash files, and there is a hash mismatch between before/after entries.")
            print("Cannot continue. Exiting. Please save your changes into a new file, locate your error, and re-run and fix it...")
            print("Another possibility is you were missing a / (with 1 character of whitespace on each side surrounding it) as a seperator.")
            return
        thehash = beforehash
        hashkey = bytes(thehash, 'utf-8')
        if hashkey in reverselookup:
            key = reverselookup[hashkey][0]
            # Caption is the final path component (Windows "\\" separator).
            torrentlist[key][b"caption"] = bytes(after[after.rfind("\\") + 1:],
                                                 'utf-8')
            try:
                # Progress numbers correspond to the plan file's line pairing.
                # Non-ASCII characters are stripped from console output since
                # the Windows console is not reliably unicode-capable.
                print(i, before.encode('ascii', errors='ignore').decode())
                print(i + 1, after.encode('ascii', errors='ignore').decode())
                os.rename(before, after)
            except Exception:
                traceback.print_exc()  # report the failure but keep going
            torrentlist[key][b"path"] = bytes(after, 'utf-8')
            # Single-file torrents (here: .mp3/.flac) also carry a "targets"
            # list that controls the filename; changing "path" alone is not
            # enough for them.
            if after.endswith((".mp3", ".flac")):
                if b"targets" in torrentlist[key]:
                    torrentlist[key][b"targets"][0][1] = torrentlist[key][
                        b"caption"]

    # BUGFIX: restore b"rec" once, after the loop. The original re-added it
    # inside the loop body, so it was never restored when the plan file was
    # empty. fileguard deliberately stays removed.
    if rec is not None:
        torrentlist[b"rec"] = rec

    # Open the output file only now, so an abort above leaves no truncated
    # NEWDAT.dat behind (the original truncated it at startup).
    with open(os.path.join(ss.get("maindir"), "NEWDAT.dat"), 'wb') as newfile:
        newfile.write(bencode.bencode(torrentlist))
        print("\nPlease note that the filenames shown are missing any unicode characters due to Windows Command Prompt limitations.")
        print("Finished writing: ", newfile.name)
Esempio n. 9
0
    def _calc_terminal_scores(self, w):
        """ Calculate the score for each possible terminal/token match.

            For each token index i in [w.start, w.end), builds a dict mapping
            every terminal that can match the token at i to an integer score;
            higher scores mark preferred interpretations. Returns the
            dict-of-dicts { token_index: { terminal: score } }.
            The terminal category codes handled below ('no', 'so', 'fs', ...)
            appear to be Icelandic part-of-speech categories, with preference
            data coming from Reynir.conf / Verbs.conf (see branch comments).
        """

        # First pass: for each token, find the possible terminals that
        # can correspond to that token
        finals = defaultdict(set)
        tokens = dict()
        self._find_options(w, finals, tokens)

        # Second pass: find a (partial) ordering by scoring the terminal alternatives for each token
        scores = dict()
        noun_prefs = NounPreferences.DICT

        # Loop through the indices of the tokens spanned by this tree
        for i in range(w.start, w.end):

            s = finals[i]
            # Initially, each alternative has a score of 0
            scores[i] = {terminal: 0 for terminal in s}

            if len(s) <= 1:
                # No ambiguity to resolve here
                continue

            # More than one terminal in the option set for the token at index i
            # Calculate the relative scores
            # Find out whether the first part of all the terminals are the same
            same_first = len(set(terminal.first for terminal in s)) == 1
            txt = tokens[i].lower
            # Get the last part of a composite word (e.g. 'jaðar-áhrifin' -> 'áhrifin')
            txt_last = txt.rsplit('-', maxsplit=1)[-1]
            # No need to check preferences if the first parts of all possible terminals are equal
            # Look up the preference ordering from Reynir.conf, if any
            prefs = None if same_first else Preferences.get(txt_last)
            sc = scores[i]
            if prefs:
                # Apply the configured pairwise (worse, better, factor)
                # preferences: penalize the 'worse' terminals and promote the
                # 'better' ones, keeping only the strongest adjustment seen
                # for each terminal (min for penalties, max for bonuses).
                adj_worse = defaultdict(int)
                adj_better = defaultdict(int)
                for worse, better, factor in prefs:
                    for wt in s:
                        if wt.first in worse:
                            for bt in s:
                                if wt is not bt and bt.first in better:
                                    if bt.name[0] in "\"'":
                                        # Literal terminal: be even more aggressive in promoting it
                                        adj_w = -2 * factor
                                        adj_b = +6 * factor
                                    else:
                                        adj_w = -2 * factor
                                        adj_b = +4 * factor
                                    adj_worse[wt] = min(adj_worse[wt], adj_w)
                                    adj_better[bt] = max(adj_better[bt], adj_b)
                for wt, adj in adj_worse.items():
                    sc[wt] += adj
                for bt, adj in adj_better.items():
                    sc[bt] += adj

            # Apply heuristics to each terminal that potentially matches this token
            for t in s:
                tfirst = t.first
                if tfirst == "ao" or tfirst == "eo":
                    # Subtract from the score of all ao and eo
                    sc[t] -= 1
                elif tfirst == "no":
                    if t.is_singular:
                        # Add to singular nouns relative to plural ones
                        sc[t] += 1
                    elif t.is_abbrev:
                        # Punish abbreviations in favor of other more specific terminals
                        sc[t] -= 1
                    if tokens[i].is_upper and tokens[i].is_word and tokens[
                            i].t2:
                        # Punish connection of normal noun terminal to
                        # an uppercase word that can be a person or entity name
                        if any(m.fl in {"ism", "föð", "móð", "örn", "fyr"}
                               for m in tokens[i].t2):
                            # logging.info("Punishing connection of {0} with 'no' terminal".format(tokens[i].t1))
                            sc[t] -= 5
                    # Noun priorities, i.e. between different genders
                    # of the same word form
                    # (for example "ára" which can refer to three stems with different genders)
                    if txt_last in noun_prefs:
                        np = noun_prefs[txt_last].get(t.gender, 0)
                        sc[t] += np
                elif tfirst == "fs":
                    if t.has_variant("nf"):
                        # Reduce the weight of the 'artificial' nominative prepositions
                        # 'næstum', 'sem', 'um'
                        sc[t] -= 8  # Make other cases outweigh the Nl_nf bonus of +4 (-2 -3 = -5)
                    elif txt == "við" and t.has_variant("þgf"):
                        sc[t] += 1  # Smaller bonus for við + þgf (is rarer than við + þf)
                    elif txt == "sem" and t.has_variant("þf"):
                        sc[t] -= 4
                    elif txt == "á" and t.has_variant("þgf"):
                        sc[t] += 4  # Larger bonus for á + þgf to resolve conflict with verb 'eiga'
                    else:
                        # Else, give a bonus for each matched preposition
                        sc[t] += 2
                elif tfirst == "so":
                    if t.num_variants > 0 and t.variant(0) in "012":
                        # Consider verb arguments
                        # Normally, we give a bonus for verb arguments: the more matched, the better
                        numcases = int(t.variant(0))
                        adj = 2 * numcases
                        # !!! Logic should be added here to encourage zero arguments for verbs in 'miðmynd'
                        if numcases == 0:
                            # Zero arguments: we might not like this
                            vo0 = VerbObjects.VERBS[0]
                            if all(
                                (m.stofn not in vo0) and (m.ordmynd not in vo0)
                                    and ("MM" not in m.beyging)
                                    for m in tokens[i].t2 if m.ordfl == "so"):
                                # No meaning where the verb has zero arguments
                                # print("Subtracting 5 points for 0-arg verb {0}".format(tokens[i].t1))
                                adj = -5
                        # Apply score adjustments for verbs with particular object cases,
                        # as specified by $score(n) pragmas in Verbs.conf
                        # In the (rare) cases where there are conflicting scores,
                        # apply the most positive adjustment
                        adjmax = 0
                        for m in tokens[i].t2:
                            if m.ordfl == "so":
                                key = m.stofn + t.verb_cases
                                score = VerbObjects.SCORES.get(key)
                                if score is not None:
                                    adjmax = score
                                    break
                        sc[t] += adj + adjmax
                    if t.is_sagnb:
                        # We like sagnb and lh, it means that more
                        # than one piece clicks into place
                        sc[t] += 6
                    elif t.is_lh:
                        # sagnb is preferred to lh, but vb (veik beyging) is discouraged
                        if t.has_variant("vb"):
                            sc[t] -= 2
                        else:
                            sc[t] += 3
                    elif t.is_lh_nt:
                        sc[t] += 12  # Encourage LHNT rather than LO
                    elif t.is_mm:
                        # Encourage mm forms. The encouragement should be better than
                        # the score for matching a single case, so we pick so_0_mm
                        # rather than so_1_þgf, for instance.
                        sc[t] += 3
                    elif t.is_vh:
                        # Encourage vh forms
                        sc[t] += 2
                    if t.is_subj:
                        # Give a small bonus for subject matches
                        if t.has_variant("none"):
                            # ... but a punishment for subj_none
                            sc[t] -= 3
                        else:
                            sc[t] += 1
                    if t.is_nh:
                        if (i > 0) and any(pt.first == 'nhm'
                                           for pt in finals[i - 1]):
                            # Give a bonus for adjacent nhm + so_nh terminals
                            # (cross-token adjustment: the preceding token's
                            # score dict is modified here as well)
                            sc[t] += 4  # Prop up the verb terminal with the nh variant
                            for pt in scores[i - 1].keys():
                                if pt.first == 'nhm':
                                    # Prop up the nhm terminal
                                    scores[i - 1][pt] += 2
                                    break
                        if any(pt.first == "no" and pt.has_variant("ef")
                               and pt.is_plural for pt in s):
                            # If this is a so_nh and an alternative no_ef_ft exists, choose this one
                            # (for example, 'hafa', 'vera', 'gera', 'fara', 'mynda', 'berja', 'borða')
                            sc[t] += 4
                    if (i > 0) and tokens[i].is_upper:
                        # The token is uppercase and not at the start of a sentence:
                        # discourage it from being a verb
                        sc[t] -= 4
                elif tfirst == "tala":
                    if t.has_variant("ef"):
                        # Try to avoid interpreting plain numbers as possessives
                        sc[t] -= 4
                elif tfirst == "person":
                    if t.has_variant("nf"):
                        # Prefer person names in the nominative case
                        sc[t] += 2
                elif tfirst == "sérnafn":
                    if not tokens[i].t2:
                        # If there are no BÍN meanings, we had no choice but to use sérnafn,
                        # so alleviate some of the penalty given by the grammar
                        sc[t] += 4
                    else:
                        # BÍN meanings are available: discourage this
                        # print(f"Discouraging sérnafn {txt}, BÍN meanings are {tokens[i].t2}")
                        sc[t] -= 10
                        if i == w.start:
                            # First token in sentence, and we have BÍN meanings:
                            # further discourage this
                            sc[t] -= 6
                elif tfirst == "fyrirtæki":
                    # We encourage company names to be interpreted as such,
                    # so we give company abbreviations ('hf.', 'Corp.', 'Limited')
                    # a high priority
                    sc[t] += 24
                elif tfirst == "st" or (tfirst == "sem"
                                        and t.colon_cat == "st"):
                    if txt == "sem":
                        # Discourage "sem" as a pure conjunction (samtenging)
                        # (it does not get a penalty when occurring as
                        # a connective conjunction, 'stt')
                        sc[t] -= 6
                elif tfirst == "abfn":
                    # If we have number and gender information with the reflexive
                    # pronoun, that's good: encourage it
                    sc[t] += 6 if t.num_variants > 1 else 2
                elif tfirst == "gr":
                    # Encourage separate definite article rather than pronoun
                    sc[t] += 2
                elif t.name[0] in "\"'":
                    # Give a bonus for exact or semi-exact matches
                    sc[t] += 1

        return scores
def main():
    """Rename torrent content files on disk and rewrite uTorrent resume data.

    Workflow:
      1. Read "newname / hash" lines from outpath3 ("3propernames.txt").
      2. Write an editable before/after rename plan to outpath4
         ("4beforepath-afterpath.txt"), then pause so the user can review
         and edit it.
      3. Re-read the (possibly edited) plan, rename the files, patch the
         decoded resume data, and write it out as NEWDAT.dat.
    """
    ss = Preferences()

    with open(ss.getwpath("outpath3"), 'r', encoding='utf-8') as infile:
        namesandhashfile = infile.readlines()  # ("3propernames.txt")

    # Path of the editable intermediate plan file (4beforepath-afterpath.txt).
    beforeafterpath = ss.getwpath("outpath4")

    torrentlist = bencode.decode_from_file(ss.get("utresumedat"))

    # These two root entries are not per-torrent dicts and would break the
    # dict comprehension below, which expects only dicts as root values.
    torrentlist.pop(b".fileguard", None)  # purposefully never restored
    rec = torrentlist.pop(b"rec", None)  # restored after processing, below

    # Reverse lookup: hex-encoded info hash -> [resume.dat key, caption, path]
    reverselookup = {
        base64.b16encode(value[b"info"]):
        [key, value[b"caption"], value[b"path"]]
        for key, value in torrentlist.items()
    }

    listofbeforeafter = []
    for eachline in namesandhashfile:
        # Each line is "<new name> / <hash>"; strip the newline and split on
        # the " / " separator written by the earlier step.
        nameandhash = eachline.strip().split(' / ')
        theNewname = nameandhash[0]
        thehash = nameandhash[1]
        hashkey = bytes(thehash, 'utf-8')
        if hashkey in reverselookup:
            key = reverselookup[hashkey][0]
            theOldPath = torrentlist[key][b"path"].decode('utf-8')
            theNewPath = os.path.join(os.path.dirname(theOldPath), theNewname)
            if theOldPath != theNewPath:
                # [oldpath, newpath, hash] -- the hash is duplicated on both
                # plan lines so user edits can be integrity-checked later.
                listofbeforeafter.append([theOldPath, theNewPath, thehash])

    # Sort, then write the plan file: oldpath+hash line, then newpath+hash.
    listofbeforeafter.sort()
    with open(beforeafterpath, 'w', encoding='utf-8') as beforeafterfile:
        for eachline in listofbeforeafter:
            beforeafterfile.write(eachline[0] + " / " + eachline[2] + "\n")
            beforeafterfile.write(eachline[1] + " / " + eachline[2] + "\n")

    # Pause here so the user can confirm (or edit) the plan before renaming.
    input("Press Enter to begin Renaming files.......\\> ")

    # WRITE TORRENT RESUME.DAT
    with open(beforeafterpath, 'r', encoding='utf-8') as planfile:
        beforeafterfile = planfile.readlines()
    for i in range(0, len(beforeafterfile), 2):
        beforeandhash = beforeafterfile[i].strip().split(' / ')
        afterandhash = beforeafterfile[i + 1].strip().split(' / ')
        before = beforeandhash[0]
        beforehash = beforeandhash[1]
        after = afterandhash[0]
        afterhash = afterandhash[1]
        if beforehash != afterhash:
            # BUGFIX: the original printed "Cannot continue. Exiting." but
            # then kept going with a stale (or, on the first pair, undefined)
            # hash. Actually stop here, as the message promises.
            print("Error. You have inadvertently modified one of the hash files, and there is a hash mismatch between before/after entries.")
            print("Cannot continue. Exiting. Please save your changes into a new file, locate your error, and re-run and fix it...")
            print("Another possibility is you were missing a / (with 1 character of whitespace on each side surrounding it) as a seperator.")
            return
        thehash = beforehash
        hashkey = bytes(thehash, 'utf-8')
        if hashkey in reverselookup:
            key = reverselookup[hashkey][0]
            # Caption is the final path component (Windows "\\" separator).
            torrentlist[key][b"caption"] = bytes(after[after.rfind("\\") + 1:],
                                                 'utf-8')
            try:
                # Progress numbers correspond to the plan file's line pairing.
                # Non-ASCII characters are stripped from console output since
                # the Windows console is not reliably unicode-capable.
                print(i, before.encode('ascii', errors='ignore').decode())
                print(i + 1, after.encode('ascii', errors='ignore').decode())
                os.rename(before, after)
            except Exception:
                traceback.print_exc()  # report the failure but keep going
            torrentlist[key][b"path"] = bytes(after, 'utf-8')
            # Single-file torrents (here: .mp3/.flac) also carry a "targets"
            # list that controls the filename; changing "path" alone is not
            # enough for them.
            if after.endswith((".mp3", ".flac")):
                if b"targets" in torrentlist[key]:
                    torrentlist[key][b"targets"][0][1] = torrentlist[key][
                        b"caption"]

    # BUGFIX: restore b"rec" once, after the loop. The original re-added it
    # inside the loop body, so it was never restored when the plan file was
    # empty. fileguard deliberately stays removed.
    if rec is not None:
        torrentlist[b"rec"] = rec

    # Open the output file only now, so an abort above leaves no truncated
    # NEWDAT.dat behind (the original truncated it at startup).
    with open(os.path.join(ss.get("maindir"), "NEWDAT.dat"), 'wb') as newfile:
        newfile.write(bencode.bencode(torrentlist))
        print("\nPlease note that the filenames shown are missing any unicode characters due to Windows Command Prompt limitations.")
        print("Finished writing: ", newfile.name)
def main():
    """Rename torrent content files and rewrite uTorrent's resume.dat.

    Python 2 variant of this script: it relies on print statements,
    raw_input, dict.iteritems/has_key, xrange and byte-string handling,
    so it will not run under Python 3.
    """
    ss = Preferences()

    oldfile = open(ss.get("utresumedat"), "rb").read()
    newfile = open(os.path.join(ss.get("maindir"), u"NEWDAT.dat"), "wb")
    namesandhashfile = open(ss.getwpath("outpath3"), "rb").readlines()

    beforeafterpath = ss.getwpath(
        "outpath4"
    )  # this holds the intermediate changes to happen before actually renaming so you have a chance to edit/change it. (4beforepath-afterpath.txt)

    torrentlist = bencode.bdecode(oldfile)

    # These two things interfere with the processing on the next line
    fileguarduseless = torrentlist.pop(".fileguard", None)
    rec = torrentlist.pop("rec", None)  # Remove this.
    # (dict. comprehension expects only dicts as the root keys)
    # create a reverse lookup dict with "Dict comprehension". nice and simple eh? ;-)
    # Maps hex-encoded info hash -> [resume.dat key, caption, path]
    reverselookup = {
        base64.b16encode(value["info"]): [key, value["caption"], value["path"]]
        for key, value in torrentlist.iteritems()
    }

    listofbeforeafter = []
    # to modify paths in reverse lookup dict, start by getting the names and hash out of the namesandhashfile
    for eachline in namesandhashfile:
        nameandhash = eachline.strip().split(
            " / "
        )  # strip out the \n with strip() and split on the " / " i put there as a seperator.
        theNewname = nameandhash[0]
        thehash = nameandhash[1]
        # searches the dict's keys for a Hash, if exists. and if so, can be used as the [indexid]
        if thehash in reverselookup:
            key = reverselookup[thehash][0]
            theOldPath = torrentlist[key]["path"]
            theNewPath = os.path.join(os.path.dirname(theOldPath), theNewname)
            if theOldPath != theNewPath:
                listofbeforeafter.append(
                    [theOldPath, theNewPath, thehash]
                )  # make a list of a list (stringtoOutputtoFile=[0], hash=[1])

    # sort, then write file detailing changes to path (before / after)
    listofbeforeafter.sort()
    beforeafterfile = open(beforeafterpath, "wb")
    for eachline in listofbeforeafter:
        try:
            beforeafterfile.write(
                eachline[0] + " / " + eachline[2] + "\n"
            )  # write oldpath + hash on 1st line    /The hash is duplicated for error checking in case the user accidentally bungles a character while editing...
            beforeafterfile.write(eachline[1] + " / " + eachline[2] + "\n")  # write newpath + hash on 2nd line   /
        except:
            # NOTE(review): bare except -- intended to catch encoding errors
            # per the message below, but it also hides unrelated failures.
            print "Error writing the before+after file, probably a encoding/unicode error: \n", eachline[
                0
            ], "\n", eachline[1]
            print "This was a fatal error and program could not continue."
            return
    beforeafterfile.close()

    # At this point the script pauses, and asks the user to confirm changes shown in the beforepath-afterpath.txt file
    raw_input(
        "Press Enter to begin Renaming files.......\\> "
    )  # wait for the user to press Enter before continuing with anything.

    # WRITE TORRENT RESUME.DAT
    beforeafterfile = open(beforeafterpath, "rb").readlines()
    for i in xrange(0, len(beforeafterfile), 2):
        beforeandhash = beforeafterfile[i].strip().split(" / ")
        afterandhash = beforeafterfile[i + 1].strip().split(" / ")
        before = beforeandhash[0].decode("utf-8")
        beforehash = beforeandhash[1]
        after = afterandhash[0].decode("utf-8")
        afterhash = afterandhash[1]
        if beforehash == afterhash:
            thehash = beforehash
        else:
            print "Error. You have inadvertently modified one of the hash files, and there is a hash mismatch between before/after entries."
            print "Cannot continue. Exiting. Please save your changes into a new file, locate your error, and re-run and fix it..."
            print "Another possibility is you were missing a / (with 1 character of whitespace on each side surrounding it) as a seperator."
            # NOTE(review): despite the message, execution falls through and
            # continues with the previous pair's hash (or raises NameError on
            # the very first pair) -- an explicit return/exit is missing here.
        # searches the dict's keys for a Hash, if exists. and if so, can be used as the [indexid]
        if thehash in reverselookup:
            key = reverselookup[thehash][0]
            # Caption becomes the final path component (Windows "\\" paths).
            torrentlist[key]["caption"] = after[after.rfind("\\") + 1 :]
            try:
                # prints a number to console to show progress. corresponds to the numbers in the file (every-two-lines).  (tip:) to show incremental numbers use (((i+1)/2)+1)
                # filenames printed to console, will be missing any unicode chars because the windows console is not unicode compatible!!!! (annoying)
                print i, before.encode("ascii", errors="ignore")
                print i + 1, after.encode("ascii", errors="ignore")
                os.rename(before, after)
            except Exception as e:
                traceback.print_exc()  # will output any errors to console but keep going
            torrentlist[key]["path"] = after
            if after.endswith(".mp3") or after.endswith(
                ".flac"
            ):  # .mp3 .flac = I personally didnt have any "Single file" .ogg, .aac, etc that needed special handling in this manner
                if torrentlist[key].has_key(
                    "targets"
                ):  # these lines are a quick fix, for an oversight in the uTorrent process. changing path is not enough
                    torrentlist[key]["targets"][0][1] = after[
                        after.rfind("\\") + 1 :
                    ]  # single-file-mode torrents have a "targets" list that controls the filename

    torrentlist["rec"] = rec  # add the thing we removed back in so we dont break anything (not sure what this is)
    # fileguard does not need to go back, in fact, purposefully needs to stay out.
    newfile.write(bencode.bencode(torrentlist))
    newfile.close()
    print "Finished writing: ", newfile.name