Example #1
def scrape_links_and_wordlistify(links, lower=False, verbose=1):
    import nltk
    import requests
    import string

    raw = ""
    wordlist = {}
    for site in links:
        try:
            if verbose == 1:
                print "[+] fetching data from: ", site
            if site.find("http://pastebin.com/") == 0:
                raw = requests.get(site.replace("http://pastebin.com/", "http://pastebin.com/raw.php?i=")).content
            else:
                raw = requests.get(site).content
            if lower:
                cleaned = string.lower(nltk.clean_html(raw))
            else:
                cleaned = nltk.clean_html(raw)
            # map the 32 punctuation characters to spaces, then split into words
            l = string.translate(cleaned, string.maketrans(string.punctuation, " " * 32)).split()
            freq_an(l, wordlist)
        except Exception:
            if verbose == 1:
                print "[-] Skipping url: ", site
    return wordlist
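
Side note: string.maketrans and string.translate as used above exist only in Python 2. A rough Python 3 equivalent of the punctuation-to-spaces step, offered as a sketch rather than part of the original example:

import string

# in Python 3 the table comes from str.maketrans and translate takes no deletechars
table = str.maketrans(string.punctuation, " " * len(string.punctuation))
print("hello, world!".translate(table).split())  # ['hello', 'world']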
Example #2
def html_for_url_node(node):
    if not re.match("javascript:", node["url"]):
        linkURL = sanitize(node["url"])
        keysURL = linkURL
        ktrspaces = "                            "
        ktrtable = string.maketrans("!@#$%^&*()_+-=`~;:'\",<.>/?\\|", ktrspaces)
        keysURL = str(keysURL).translate(ktrtable, "").lower()
        keysURL = str(keysURL).translate(None, "!@#$%^&*()_+-=`~;:'\",<.>/?\\|").lower()
        #
        tags = sanitize(node["name"])
        # tags= node['name'] Check for UTF-8 etc...
        # print "TAGS: ",tags
        # tags = tags.translate(None,'!@#$%^&*()_+-=`~;:\'",<.>/?\\|')
        # tags.translate(None,'!@#$%^&*()_+-=`~;:\'",<.>/?\\|')
        # trtable =                          '                              '
        trspaces = "                            "
        trtable = string.maketrans("!@#$%^&*()_+-=`~;:'\",<.>/?\\|", trspaces)
        # tags = str(tags).translate(trtable,'!@#$%^&*()_+-=`~;:\'",<.>/?\\|')
        tags = str(tags).translate(trtable, "").lower()
        tags = str(tags).translate(None, "!@#$%^&*()_+-=`~;:'\",<.>/?\\|").lower()
        #
        allTags = tags + " " + keysURL
        print "# '", sanitize(node["url"]), "'", allTags
        # print '# \'',sanitize(node['url']),'\'', tags
        return '<dt><a href="%s">%s</a>\n' % (sanitize(node["url"]), sanitize(node["name"]))
    else:
        return ""
Example #3
def autogen_bibtex_key(obj):
    keyauthorcompact = u""
    keytitlecompact = string.join(
        [
            i[:3]
            for i in string.split(obj.Title().encode("utf-8").translate(string.maketrans("", ""), string.punctuation))
            if len(i) > 3
        ],
        "",
    )  # concatenates the first three letters of each word longer than three characters, to make a dependably unique key
    if obj.AuthorIsCorporate() and obj.Author():
        keyauthorcompact = string.join(
            [
                i[:1]
                for i in string.split(
                    obj.Author().encode("utf-8").translate(string.maketrans("", ""), string.punctuation)
                )
            ],
            "",
        )  # same for authors, but just initials
    elif obj.Author():
        # print split_name_list(obj.Author())
        keyauthorcompact = string.split(
            obj.Author().encode("utf-8").translate(string.maketrans("", ""), string.punctuation)
        )[-1]
    key = "%s%s_%s" % (keyauthorcompact, str(obj.ReferDate().year), keytitlecompact)
    return unicode(key, "utf8", "ignore")
Example #4
def translate(read, ref, match):
    # reverse-complement an aligned read/reference pair; the match string is only reversed
    read = read[::-1]
    read = read.translate(string.maketrans("ACGTacgt", "TGCAtgca"))
    ref = ref[::-1]
    # the reference may carry IUPAC ambiguity codes, which are complemented as well
    ref = ref.translate(string.maketrans("ACGTacgtRYKMBVDH", "TGCAtgcaYRMKVBHD"))
    match = match[::-1]
    return (read, ref, match)
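
A quick usage sketch with made-up inputs: read and ref are reverse-complemented, while the match string is only reversed:

print translate("AACG", "AACG", "||||")  # ('CGTT', 'CGTT', '||||')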
Example #5
 def checkData(self,row):
     jobTime = row[4]
     jobTerm = row[5]
     w1 = {'full time': 1, 'part time': -1, '': 0}    # weights for the time feature
     w2 = {'permanent': 0, 'contract': 1, '': -1}     # weights for the term feature
     
     if jobTime == '' or jobTerm == '':
         s = row[2].lower()
         # the next line maps punctuation, digits, and stray non-ASCII bytes in the description to spaces
         s=s.translate(string.maketrans("‘’‚“”„†‡‰‹›!“#$%&‘()™*+,-˜./0123456789:;<=>?@[\]_`{|}~–—¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿Þßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ€¢â—ªïž'","                                                                                                                                "))
         if jobTime=='':
             if ('full time' in s and 'part time' in s) or ('full time' not in s and 'part time' not in s):
                 word1=''
             else:
                 if 'full time' in s:      #searching full time in description
                     word1='full time'
                 else:
                     word1='part time'
         else:
             word1 = jobTime.translate(string.maketrans("_", " "))  # turn underscores in the time value into spaces
             
         if jobTerm=='':
             if ('permanent' in s and 'contract' in s) or ('permanent' not in s and 'contract' not in s):
                 word2=''
             else:
                 if 'permanent' in s:      #searching permanent in description
                     word2='permanent'
                 else:
                     word2='contract'
         else:
             word2 = jobTerm.translate(string.maketrans("_", " "))  # turn underscores in the term value into spaces
     
     else:
         word1 = jobTime.translate(string.maketrans("_", " "))
         word2 = jobTerm.translate(string.maketrans("_", " "))
         
     return [word1,w1[word1],word2,w2[word2]]
Example #6
 def retrieve_access_token(self):
     output.speak(_("Please wait while an access token is retrieved from Twitter."), True)
     httpd = BaseHTTPServer.HTTPServer(("127.0.0.1", 8080), Handler)
     twitterDataOrig = str(self.config["oauth"]["twitterData"])
     trans = maketrans("-_~", "+/=")
     twitterDataTrans = twitterDataOrig.translate(trans)
     twitterData = b64decode(twitterDataTrans)
     twitterData = literal_eval(twitterData)
     tw = Twython(twitterData[0], twitterData[1], auth_endpoint="authorize")
     try:
         auth = tw.get_authentication_tokens("http://127.0.0.1:8080")
     except SSLError:
         output.speak(
             _(
                 "Sorry, we can't connect to Twitter. You may want to adjust your firewall or antivirus software appropriately"
             ),
             True,
         )
         return  # bail out here; no auth tokens were obtained
     webbrowser.open_new_tab(auth["auth_url"])
     global logged, verifier
     logged = False
     while not logged:
         httpd.handle_request()
     self.auth_handler = Twython(twitterData[0], twitterData[1], auth["oauth_token"], auth["oauth_token_secret"])
     token = self.auth_handler.get_authorized_tokens(verifier)
     output.speak(_("Retrieved access token from Twitter."), True)
     httpd.server_close()
     data = [token["oauth_token"], token["oauth_token_secret"]]
     eData = dumps(data)
     trans = maketrans("+/=", "-_~")
     eData = b64encode(eData)
     eData = eData.translate(trans)
     self.config["oauth"]["userData"] = eData
     self.login()
     del (httpd, auth, tw, token, logged, verifier, twitterData, twitterDataOrig, data, eData, self.auth_handler)
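
The "+/=" to "-_~" tables above implement a URL- and file-safe base64 variant, with "~" standing in for the "=" padding. A minimal round-trip sketch of the scheme, using assumed sample data:

from base64 import b64decode, b64encode
from string import maketrans

stored = b64encode("('key', 'secret')").translate(maketrans("+/=", "-_~"))
print b64decode(stored.translate(maketrans("-_~", "+/=")))  # ('key', 'secret')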
Example #7
def create_node_list(sentence1, sentence2, threshold=1.4):
    node_list1 = []
    node_list2 = []
    sentence1 = sentence1.translate(string.maketrans("", ""), string.punctuation)
    sentence2 = sentence2.translate(string.maketrans("", ""), string.punctuation)
    sentence1_ranks = map(node_rank, sentence1.split())
    sentence2_ranks = map(node_rank, sentence2.split())
    # average rank per word (divide by the word count, not the character count)
    avg_rank1 = sum(sentence1_ranks) / len(sentence1_ranks)
    avg_rank2 = sum(sentence2_ranks) / len(sentence2_ranks)
    STOPWORDS = generate_stopword_list("stopwords.txt")

    index = 0
    for word in sentence1.split():
        word = word.lower()
        if word not in STOPWORDS:
            if (sentence1_ranks[index] / avg_rank1) >= threshold:
                node_list1.append(word)
        index += 1
    index = 0
    for word in sentence2.split():
        word = word.lower()
        if word not in STOPWORDS:
            if (sentence2_ranks[index] / avg_rank2) >= threshold:
                node_list2.append(word)
        index += 1

    return node_list1, node_list2
Example #8
def wordFrequencies(comments, submissions):
    frequencies = {}  # assuming a fresh dict per call; the original relied on an outer-scope name

    for comment in comments:
        words = comment.split()
        table = string.maketrans("", "")

        for word in words:
            if word[:4] == "http":
                frequencies[word] = frequencies.get(word, 0) + 1
            else:
                # next line translates unicode to string so we can use .translate
                sword = unicodedata.normalize("NFKD", word).encode("ascii", "ignore")
                lowercase = sword.lower()
                correctWord = lowercase.translate(table, string.punctuation)
                # count the number of times a user uses a given word
                frequencies[correctWord] = int(frequencies.get(correctWord, 0)) + 1

    for submission in submissions:
        subs = submission.split()
        table = string.maketrans("", "")

        for s in subs:
            if s[:4] == "http":
                frequencies[s] = frequencies.get(s, 0) + 1
            else:
                sword = unicodedata.normalize("NFKD", s).encode("ascii", "ignore")
                lowercase = sword.lower()  # str has no .lowercase(); .lower() is the method wanted here
                correctWord = lowercase.translate(table, string.punctuation)
                frequencies[correctWord] = int(frequencies.get(correctWord, 0)) + 1

    return frequencies
Example #9
 def checkreserveduser(user):
     if os.path.isdir(os.path.join(config.datadir, user)):
         return
     if user.lower() != user:
         errorexit("Username should be lowercase.")
     normalized = user.lower()
     # catch look-alike names: map digits to the letters they resemble
     normalizedi = normalized.translate(string.maketrans("013456789", "oieasbtbg"))
     if normalized != normalizedi and os.path.isdir(os.path.join(config.datadir, normalizedi)):
         errorexit("Username is reserved.")
     normalizedl = normalized.translate(string.maketrans("013456789", "oleasbtbg"))
     if normalizedl != normalized and os.path.isdir(os.path.join(config.datadir, normalizedl)):
         errorexit("Username is reserved.")
     with open(os.path.join(progpath, "bad-words.txt")) as f:
         badwords = f.read().splitlines()
     if any(word in badwords for word in [normalized, normalizedi, normalizedl]):
         errorexit("Username is reserved.")
     with open(os.path.join(progpath, "bad-substrings.txt")) as f:
         badsubstrings = f.read().splitlines()
     if any(substring in word for word in [normalized, normalizedi, normalizedl] for substring in badsubstrings):
         errorexit("Username is reserved.")
     return
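
The two digit-to-letter tables above undo leetspeak-style substitutions, differing only in whether "1" is read as "i" or as "l". A small sketch:

import string

print "l33t5p34k".translate(string.maketrans("013456789", "oieasbtbg"))  # leetspeak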
Example #10
    def reverse_content(self):
        from copy import deepcopy
        import string

        init_contigs_compo = deepcopy(self.composition)
        new_contigs_compo = deepcopy(self.new_compo)
        FragmentS = deepcopy(self.FragmentS)
        new_contigs_index = 0
        init_contigs_index = 0
        fragment_index = 0
        init_contigs_compo.reverse()
        for ele in init_contigs_compo:
            init_contigs_index += 1
            ele["orientation"] = ele["orientation"].translate(string.maketrans("wc", "cw"))
            ele["pos_id"] = init_contigs_index

        new_contigs_compo.reverse()
        for ele in new_contigs_compo:
            new_contigs_index += 1
            ele["orientation"] = ele["orientation"].translate(string.maketrans("wc", "cw"))
            ele["position"] = new_contigs_index

        FragmentS.reverse()
        for ele in FragmentS:
            fragment_index += 1
            ele.curr_id = fragment_index

        result = dict()
        result["init_contig_compo"] = init_contigs_compo
        result["new_contig_compo"] = new_contigs_compo
        result["FragmentS"] = FragmentS
        return result
Example #12
    def __rhymeQuotient(comment):
        """
        Calculate the "rhymy-ness" of a comment, basically how many line ending words
        are similar to other line-ending words
        """
        # strip punctuation and numbers
        comment = comment.translate(string.maketrans("", ""), string.punctuation)
        comment = comment.translate(string.maketrans("", ""), string.digits)

        lines = comment.split("\n")

        lastWords = []
        for line in lines:
            words = line.split()
            if len(words) >= 1:
                lastWords.append(words[-1])

        # can't have rhymes if we have fewer than two lines
        if len(lastWords) < 2:
            return 0

        # print lastWords
        # now score each word by similarity with a following word
        for i in range(len(lastWords)):
            best = 0
            for j in range(i + 1, len(lastWords)):
                best = max(best, TimesComments.__sharedLetters(lastWords[i], lastWords[j]))
            lastWords[i] = best

        lastWords = map(lambda x: 5 if x >= 2 else x, lastWords)
        return sum(lastWords) / (len(lastWords) - 1)
Example #13
File: solve.py Project: yaeda/gdd11
def solve_partial(w, h, pattern, answer, fix_num, solve_num, reset_loop_count=RESET_LOOP_COUNT):
    trans_str_wall = answer[:fix_num]
    trans_table_wall = string.maketrans(trans_str_wall, "=" * len(trans_str_wall))
    trans_str_asta = answer[fix_num + solve_num : -1].replace("=", "")
    trans_table_asta = string.maketrans(trans_str_asta, "*" * len(trans_str_asta))
    pattern_rep = pattern.translate(trans_table_wall)
    pattern_rep = pattern_rep.translate(trans_table_asta)
    answer_rep = answer.translate(trans_table_wall)
    answer_rep = answer_rep.translate(trans_table_asta)

    ####### debug #######
    print "--------- pattern_rep"
    print_pattern(w, h, pattern_rep)
    print "--------- answer_rep"
    print_pattern(w, h, answer_rep)
    ####### debug #######

    move = solve_all(w, h, pattern_rep, answer_rep, reset_loop_count)

    ####### debug #######
    if move:
        pattern_work = create_pattern(w, h, pattern, move)
        print "--------- succeeded"
        print_pattern(w, h, pattern_work)
    else:
        print "--------- not succeeded"
    ####### debug #######
    return move
Example #14
def unlisted_words(sample, reference):
    # Strip punctuation, lowercase, and split each string into a list of words
    s = (sample.translate(string.maketrans("", ""), string.punctuation)).lower().split()
    r = (reference.translate(string.maketrans("", ""), string.punctuation)).lower().split()
    # Keep the sample words absent from the reference; set() removes duplicates
    return list(set([w for w in s if w not in r]))
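
Usage sketch with made-up inputs (sorted, since set ordering is arbitrary):

print sorted(unlisted_words("The cat sat!", "The mat."))  # ['cat', 'sat']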
Example #15
def validate_input(allowed, inp):
    if allowed is None:
        allowed = string.ascii_letters + string.digits + "-_"
    inp = inp.encode("UTF-8")
    # Every allowed byte is replaced by a space in this 256-byte table, so the
    # allowed bytes no longer occur in it; using it as the deletechars argument
    # therefore strips everything *except* the allowed characters.
    delete_table = string.maketrans(allowed, " " * len(allowed))
    table = string.maketrans("", "")  # identity translation
    return inp.translate(table, delete_table)
Example #16
def get_encrypt_decrypt_tables():
    # identity table as a 256-byte string, split into single-byte strings
    orig_table = string.maketrans(b"", b"")
    orig_table = [orig_table[i : i + 1] for i in range(len(orig_table))]
    orig_table_decrypt = string.maketrans(b"".join(orig_table), string.maketrans(b"", b""))

    # derive the two translation tables from the configured password
    encrypt_table, decrypt_table = get_table(bytes(PASSWORD))
    return encrypt_table, decrypt_table
Example #17
 def test_the_reverse_complement(self):
     """Check obj.reverse_complement() method."""
     mapping = ""
     for example1 in self._examples:
         if isinstance(example1, MutableSeq):
             continue
         try:
             comp = example1.reverse_complement()
         except ValueError as e:
             self.assertEqual(str(e), "Proteins do not have complements!")
             continue
         str1 = str(example1)
         # This only does the unambiguous cases
         if "U" in str1 or "u" in str1 or example1.alphabet == generic_rna:
             mapping = maketrans("ACGUacgu", "UGCAugca")
         elif (
             "T" in str1
             or "t" in str1
             or example1.alphabet == generic_dna
             or example1.alphabet == generic_nucleotide
         ):
             mapping = maketrans("ACGTacgt", "TGCAtgca")
         elif "A" not in str1 and "a" not in str1:
             mapping = maketrans("CGcg", "GCgc")
         else:
             # TODO - look at alphabet?
             continue
         self.assertEqual(str1.translate(mapping)[::-1], str(comp))
         self.assertEqual(comp.alphabet, example1.alphabet)
Example #18
    def __init__(self, translate_names=True, **kwargs):
        super(JUnit, self).__init__(**kwargs)

        if translate_names:
            # map "/" to "." and "." to "-" so file paths read as JUnit-style names
            self.name_table = string.maketrans("/.", ".-")
        else:
            self.name_table = string.maketrans("", "")  # identity table
Example #19
def translator(frm="", to="", delete="", keep=None):
    """generate a translator which can be called on a string for substituting
    ``frm`` to ``to`` , deleting ``delete`` and keep the ``keep`` . This
    funciton is only for python 2 becuase the ``maketrans`` funciton returns
    a dict in python 3.

    examples:

    >>> trans = translator('1234567890', '!@#$%^&*()')
    >>> trans('a1b2c3d4e5f6g7')
    'a!b@c#d$e%f^g&'
    >>> trans = translator('1234567890', '!@#$%^&*()', '123')
    >>> trans('a1b2c3d4e5f6g7')
    'abcd$e%f^g&'
    >>> trans = translator('1234567890', '!@#$%^&*()', '123', '345')
    >>> trans('a1b2c3d4e5f6g7')
    '$%'
    >>> trans = translator('1234567890', '!@#$%^&*()', '123', '345ab')
    >>> trans('a1b2c3d4e5f6g7')
    'ab$%'
    """
    if len(to) == 1:
        to = to * len(frm)

    table = string.maketrans(frm, to)
    if keep is not None:
        all_chars = string.maketrans("", "")
        # delete everything except ``keep``, while still honouring ``delete``
        delete = all_chars.translate(all_chars, keep.translate(all_chars, delete))

    def translate(s):
        return s.translate(table, delete)

    return translate
Example #20
    def generateCounts(self):
        wordCounts = {}
        hashtagCounts = {}

        for tweet in self.trainSet:
            hashtags = []
            for word in tweet.split():
                if word.startswith("#") and len(word) > 2:
                    word = word.lower().translate(string.maketrans("", ""), string.punctuation)  # remove punctuation
                    hashtags.append(word)
                    if word not in wordCounts:
                        wordCounts[word] = 1
                    else:
                        wordCounts[word] += 1
                else:
                    if "@" in word:
                        continue
                    if word in self.stopWords:
                        continue
                    word = word.lower().translate(string.maketrans("", ""), string.punctuation)  # remove punctuation
                    if word not in wordCounts:
                        wordCounts[word] = 1
                    else:
                        wordCounts[word] += 1

            for hashtag in hashtags:
                if hashtag not in hashtagCounts:
                    hashtagCounts[hashtag] = 1.0
                else:
                    hashtagCounts[hashtag] += 1.0

        return wordCounts, hashtagCounts
Example #21
def make_table(do_encrypt=True):
    original = "".join([chr(i) for i in range(0, 128)])
    encrypted = "".join([chr(i * 2 % 127) for i in range(0, 128)])
    if do_encrypt:
        return string.maketrans(original, encrypted)
    else:
        return string.maketrans(encrypted, original)
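
A round-trip sketch. One caveat: i * 2 % 127 is not a bijection over range(128) (chr(0) and chr(127) both encrypt to chr(0)), so decryption is lossy for those two bytes; ordinary text round-trips fine:

enc, dec = make_table(True), make_table(False)
print "hello".translate(enc).translate(dec)  # hello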
Example #22
    def generateHashtagSpecificVocabulary(self):
        wordsMappedToHashtags = {}

        for tweet in self.trainSet:
            words = []
            hashtags = []
            for word in tweet.split():
                if word.startswith("#") and len(word) > 2:
                    word = word.lower().translate(string.maketrans("", ""), string.punctuation)  # remove punctuation
                    hashtags.append(word)
                    words.append(word)
                else:
                    if "@" in word:
                        continue
                    if word in self.stopWords:
                        continue
                    word = word.lower().translate(string.maketrans("", ""), string.punctuation)  # remove punctuation
                    words.append(word)

            for hashtag in hashtags:
                if hashtag not in wordsMappedToHashtags:
                    wordsMappedToHashtags[hashtag] = {}
                for word in words:
                    if word not in wordsMappedToHashtags[hashtag]:
                        wordsMappedToHashtags[hashtag][word] = 1.0
                    else:
                        wordsMappedToHashtags[hashtag][word] += 1.0

        return wordsMappedToHashtags
Example #23
def buildGoodSet(goodChars=string.printable, badChar="?"):
    """Build a translation table that turns all characters not in goodChars
    to badChar"""
    allChars = string.maketrans("", "")
    badchars = string.translate(allChars, allChars, goodChars)
    rv = string.maketrans(badchars, badChar * len(badchars))
    return rv
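
Usage sketch: any byte outside string.printable comes back as the bad character:

table = buildGoodSet()
print "caf\xe9".translate(table)  # caf?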
Example #24
def complement(seq_list):
    # accepts either a single sequence string or a list of sequence strings
    if isinstance(seq_list, list):
        for i in range(len(seq_list)):
            seq_list[i] = seq_list[i].translate(maketrans("atgcATGC", "tacgTACG"))
        return seq_list
    elif isinstance(seq_list, str):
        return seq_list.translate(maketrans("atgcATGC", "tacgTACG"))
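
For example (this complements without reversing, so it is not a reverse complement):

print complement("ATGC")  # TACG
print complement(["at", "GC"])  # ['ta', 'CG']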
Example #25
def hashword(plaintext):
    """
    Munge a plaintext word into something else. Hopefully, the result
    will have some mnemonic value.
    """
    # get a list of random bytes. A byte will be randomly picked from
    # this list when needed.
    rb = getrandomlist()
    # 0.25 chance of case being swapped
    if rb[rb[0]] < 64:
        plaintext = string.swapcase(plaintext)
    # 0.50 chance of vowels being translated one of two ways.
    if rb[rb[2]] > 127:
        plaintext = string.translate(plaintext, string.maketrans("aeiou AEIOU", "@3!0& 4#10%"))
    else:
        plaintext = string.translate(plaintext, string.maketrans("aeiou AEIOU", "^#1$~ $3!0&"))
    # 0.4 chance of some additional consonant translation
    if rb[rb[4]] < 102:
        plaintext = string.translate(plaintext, string.maketrans("cglt CGLT", "(<1+ (<1+"))
    # if the word is short, append some digits
    if len(plaintext) < 5:
        plaintext = plaintext + str(rb[5])
    # 0.2 chance of some more digits appended
    if rb[rb[3]] < 51:
        plaintext = plaintext + str(rb[205])
    return plaintext
Example #26
def listShows(path, forBackLog=False):

    if not forBackLog:
        trans = string.maketrans(" ", ".")
    else:
        trans = string.maketrans(" ", "_")

    shows = {}
    try:
        for show_name in os.listdir(path):
            if os.path.isdir(path + show_name):
                sanitized_show_name = show_name.translate(trans, "'().!").lower()
                shows[sanitized_show_name] = show_name
    except OSError:
        log.error("Unable to find " + path)
        sys.exit()

    items = config["rss.mapping"].split(",")
    if len(items) > 0:
        for item in items:
            i = item.split("=")
            if len(i) > 0:
                local = i[0].strip()
                dist = i[1].strip().translate(trans, "'().!").lower()
                shows[dist] = local
                log.debug("Extra TV shows mapping : %s => %s" % (dist, local))

    return shows
Example #27
def tm():

    import sys
    import nltk
    import string

    input_file_name = raw_input("Please enter the input file name: ")
    input_path = raw_input("Please enter the input path: ")
    output_file_name = raw_input("Please enter the output file name: ")
    print "\nPlease note that the above entered filename would be used as",
    print "a prefix for the entire set of documents to be generated.\n"
    output_path = raw_input("Please enter the output path: ")

    with open(input_path + "\\" + input_file_name + ".txt", "r") as f:

        para = []
        data = f.read()
        selected = 0
        notselect = 0
        sentences = data.split("\n\n")

        print "Total # of paragraphs", len(sentences)

        for x in xrange(len(sentences)):
            cond = sentences[x].endswith(".")
            if cond:
                cnt = sentences[x].count(".")
            else:
                cnt = sentences[x].count(".") + 1

            if cnt > 5:
                # print "paragraph ",x+1,"is selected"
                selected += 1
                sentences[x] = "@" + sentences[x].lower()
                sentences[x] = sentences[x].translate(string.maketrans("", ""), string.digits)
                sentences[x] = sentences[x].translate(string.maketrans("", ""), string.punctuation)
                tokens = nltk.word_tokenize(sentences[x])
                lemma = nltk.WordNetLemmatizer()
                porter = nltk.PorterStemmer()

                afewwords = [lemma.lemmatize(i) for i in tokens]
                afewwords = [porter.stem(i) for i in afewwords]  # stem the lemmatized tokens

                sentences[x] = " ".join(afewwords)
                para.append(sentences[x])

                filename = output_path + "\\" + output_file_name + str(selected) + ".txt"
                w = open(filename, "w")
                w.write("".join(para))
                w.close()
                para = []
            else:
                # print "paragraph ",x+1,"is not selected"
                notselect += 1
            # print "cnt - ", cnt
        # print"\n"

        print "# of paragraphs selected", selected
        print "# of paragraphs not selected", notselect
Example #28
    def add_entry(self, _format, unit, cause=None, effect=None):
        report_obj = self.report_formats[_format]
        colorized_list = []
        word_list = report_obj.string.split(" ")

        for word in word_list:
            color = ""
            if word.translate(string_module.maketrans("", ""), self.strip_string) == "%unit":
                word = word.replace("%unit", unit.name)
                color = self.colorize_unit_name(unit)
            elif (
                word.translate(string_module.maketrans("", ""), self.strip_string) == "%cause"
                and report_obj.cause_color
            ):
                word = word.replace("%cause", cause)
                color = report_obj.cause_color
            elif (
                word.translate(string_module.maketrans("", ""), self.strip_string) == "%effect"
                and report_obj.effect_color
            ):
                word = word.replace("%effect", effect)
                color = report_obj.effect_color
            if not color:
                color = "text"
            if report_obj.line_color:
                color = report_obj.line_color
            colorized_list.append(BattleReportWord(word, color))

        self.turn_report.append(colorized_list)
        self.process_report()
Example #29
def build_sample_ids_transtable():
    """Build translation table for sample ids being MIENS compliant"""
    all_chars = "".join([chr(i) for i in range(128)])
    valid_sample_id_chars = letters + digits + "."
    non_valid_sample_id_chars = all_chars.translate(maketrans("", ""), valid_sample_id_chars)
    trans_table = maketrans(non_valid_sample_id_chars, "." * len(non_valid_sample_id_chars))
    return trans_table
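
Usage sketch, assuming the module-level "from string import letters, digits, maketrans" that the bare names imply:

table = build_sample_ids_transtable()
print "Sample 1!".translate(table)  # Sample.1.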
Example #30
def url_sign(uri_path, params, client_id, signing_key):
    # the key arrives URL-safe base64 encoded: convert to the standard alphabet
    # and restore the stripped "=" padding before decoding
    signing_key = signing_key.translate(string.maketrans("-_", "+/"))
    padding_factor = (4 - len(signing_key) % 4) % 4
    signing_key += "=" * padding_factor
    binary_key = base64.b64decode(signing_key)

    # construct URI for signing
    uri_path_params = uri_path + "?"
    first = True
    for k in params.keys():
        if not first:
            uri_path_params += "&"
        else:
            first = False
        uri_path_params = "%(base)s%(key)s=%(value)s" % {
            "base": uri_path_params,
            "key": k,
            "value": urllib.quote_plus(str(params[k])),
        }
    uri_path_params += "&client=" + client_id

    # Sign
    digest = hmac.new(binary_key, uri_path_params, hashlib.sha1).digest()
    digest = base64.b64encode(digest)
    digest = digest.translate(string.maketrans("+/", "-_"))
    return "%s&sig=%s" % (uri_path_params, digest.rstrip("="))