def parse_arxiv(fname, URLbase='https://arxiv.org'):
    """
    Parse the html code from an arxiv "new" section.

    fname   -- path to a saved arXiv "new" listing HTML page
    URLbase -- prefix joined onto the relative hrefs found in the page
    returns -- (titles, sections): titles is the list of section header
               strings that were processed; sections is
               [NewSub, CrossList, Replacements], each a list of
               ArXivEntry objects (Replacements is skipped, so it stays
               empty).
    """
    # NOTE(review): `today` is computed but never used below.
    today = dt.datetime.now().date()
    # NOTE(review): file handle is never closed; consider `with open(...)`.
    html_doc = open(fname, 'r').read()
    S = BeautifulSoup(html_doc, 'html.parser')
    # First <h3> reads like "New submissions for <date>"; keep the date part.
    h3 = S.find('h3').text
    date = h3.split('for')[-1].lstrip().rstrip()
    # NOTE(review): '%y' expects a two-digit year -- confirm this matches
    # the header format of the pages actually being parsed.
    date = dt.datetime.strptime(date, '%a, %d %b %y').date()
    NewSub = []
    CrossList = []
    Replacements = []
    sections = [NewSub, CrossList, Replacements]
    titles = []
    sect = 0
    # Each listing section is an <h3> header paired with a <dl> of entries.
    for dl, h3 in zip(S.find_all('dl'), S.find_all('h3')):
        ## Skip replacements
        section = h3.text.split('for')[0].lstrip().rstrip()
        if section not in ['New submissions', 'Cross-lists']:
            continue
        ## report section. #TODO log this
        h = h3.text
        titles.append(h)
        for dt_tag, dd_tag in zip(dl.find_all('dt'), dl.find_all('dd')):
            ## parsing dt tag
            # "[3]" -> 3: position of the entry within the listing.
            index = int(dt_tag.find('a').text.replace('[', '').replace(']', ''))
            arxivID = dt_tag.find('a', title='Abstract').text.replace('arXiv:', '')
            URLabs = URLbase + dt_tag.find('a', title='Abstract')['href']
            URLpdf = URLbase + dt_tag.find('a', title='Download PDF')['href']
            ## parsing dd tag
            # Title
            title = dd_tag.find('div', class_='list-title mathjax').text
            title = text.clean(title.replace('Title: ', ''))
            # Authors
            authors = []
            for auth in dd_tag.find('div', class_='list-authors').find_all('a'):
                authors.append(Author(auth.text, URLbase + auth['href']))
            # Subjects
            subjects = dd_tag.find('div', class_='list-subjects').text
            subjects = subjects.replace('Subjects: ', '').split(';')
            subjects = [x.lstrip().rstrip() for x in subjects]
            # Abstract
            abstract = text.clean(dd_tag.find('p', class_='mathjax').text)
            ## arXiv entry
            A = ArXivEntry(title=title, authors=authors, abstract=abstract,
                           subject=subjects, ID=arxivID, urlabs=URLabs,
                           urlpdf=URLpdf, index=index)
            sections[sect].append(A)
        # Only advances for sections that were actually processed, so the
        # skipped "Replacements" section never receives entries.
        sect += 1
    return titles, sections
def get_score(self, input):
    """
    Score *input* against the stored n-graph frequency table.

    input -- candidate plaintext; a cleaned, upper-cased copy is scored
             (the caller's string is not modified)
    returns -- summed frequency score over all tetragraphs when
               self.tetragraphs is truthy, otherwise over all digraphs.
    """
    # NOTE: `input` shadows the builtin, but the name is part of the
    # public signature (callers may pass it by keyword), so it stays.
    input = text.clean(input.upper(), self.alphabet)
    length = len(input)
    # Build a char -> index table once instead of an O(len(alphabet))
    # .index() scan per character; setdefault keeps the FIRST occurrence,
    # matching .index() semantics for duplicated alphabet characters.
    idx = {}
    for i, c in enumerate(self.alphabet):
        idx.setdefault(c, i)
    score = 0
    if self.tetragraphs:
        for j in range(length - 3):
            # Base-26 position of the 4-gram in the frequency table.
            position = (17576 * idx[input[j]] + 676 * idx[input[j + 1]]
                        + 26 * idx[input[j + 2]] + idx[input[j + 3]])
            score += self.frequencies[position]
    else:
        for j in range(length - 1):
            # Base-26 position of the 2-gram in the frequency table.
            score += self.frequencies[26 * idx[input[j]] + idx[input[j + 1]]]
    return score
def dict(self):
    """
    Dictionary attack: try each dictionary word as a transposition key,
    score each decryption by frequency analysis, then run a
    monoalphabetic crack on the best candidate and display the result.

    (The method name shadows the builtin `dict`, but it is part of the
    public interface and is kept.)
    """
    in_str = self._display.get_input()
    dict_location = self._display.key_dialog(
        textboxes=[('Dictionary location', 'cipher/dict.txt')],
        title='Dictionary location')[1][0]
    # FIX: context manager closes the dictionary file (it used to leak);
    # local renamed from `dict`, which shadowed the builtin.
    with open(dict_location) as dict_file:
        word_list = [text.clean(word).upper() for word in dict_file.readlines()]
    transpo = transposition.Transposition(None, independent=False)
    monoalph = monoalphabetic.Monoalphabetic(None, independent=False)
    freq = frequencies.Frequencies(None, independent=False)
    best_score = 10000.0
    best_key = ''
    for word in word_list:
        if len(word) > 0:
            decrypted = transpo._perform((word, 0, 1, 0, 0), in_str)
            score = freq.analyse(decrypted, graph=False)
            if score < best_score:
                best_score = score
                best_key = word
            # Show promising candidates as we go.
            if score < 10:
                print('%s : %d : %s' % (word, score, self.decrypt(
                    (self.generate_key(decrypted), 'A', 1), decrypted)[:20]))
    print('Best key is %s' % best_key)
    # Let the user confirm/override the transposition key.
    best_key = self._display.key_dialog(
        textboxes=[('Transposition key', str(best_key).strip('()'))],
        title='Transposition Key')[1][0]
    in_str = transpo._perform((best_key, 0, 1, 0, 0), in_str)
    f_key = self.generate_key(in_str)
    in_str = self.decrypt((f_key, '1', 0), in_str)
    monoalph = monoalphabetic.Monoalphabetic(self._display, independent=False)
    f_key = monoalph.crack_key(in_str)
    out_str = monoalph.decrypt(f_key, in_str)
    self._display.show_output(out_str)
def get_key_phrases(text, language_code='en'):
    """
    Extract key phrases from *text* with AWS Comprehend.

    text          -- the document text
    language_code -- Comprehend language code (default 'en')
    returns -- de-duplicated, cleaned list of key-phrase strings
    raises  -- re-raises any Comprehend client error after logging it

    Texts at or over COMPREHEND_SIZE_LIMIT are chunked and sent through
    the batch API; smaller texts use the single-document call.
    """
    comprehend = boto3.client('comprehend')
    logger.info("Getting key phrases...")
    if getsizeof(text) >= COMPREHEND_SIZE_LIMIT:
        logger.info("Too big! Proceeding with chunkification")
        chunks, weights = chunkify(text, COMPREHEND_SIZE_LIMIT)
        try:
            r = comprehend.batch_detect_key_phrases(
                TextList=[val['text'] for val in chunks.values()],
                LanguageCode=language_code)
            kps = []
            for kp_list in r['ResultList']:
                kps.extend(kp_list['KeyPhrases'])
        except Exception as e:
            logger.error(str(e))
            raise e
    else:
        try:
            # BUG FIX: this path previously hard-coded LanguageCode='en',
            # silently ignoring the language_code parameter.
            r = comprehend.detect_key_phrases(Text=text,
                                              LanguageCode=language_code)
            kps = r['KeyPhrases']
        except Exception as e:
            logger.error(str(e))
            raise e
    return text_processing.clean(list(set(x['Text'] for x in kps)))
def crack_key(self, in_str):
    """
    Attempt to recover a 2x2 Hill-cipher key from *in_str*.

    Builds a matrix from the two most common ciphertext digraphs, tries
    every candidate plaintext digraph pairing from self.digraph_perms as
    the matching plaintext, and scores each resulting key by frequency
    analysis of its decryption.

    returns -- the best key serialised as a comma-separated string of
               its entries (only on the key_length == 2 path).
    """
    rvalues, tvalues = self._display.key_dialog(
        radioboxes=[['Auto solve', 'Crib']],
        textboxes=[['Key length', '2'], ['Crib', ''], ['Index', '0']],
        title=self.title + ' cracker')
    in_str = text.clean(in_str)
    freq = frequencies.Frequencies(None, independent=False)
    if rvalues[0] == 0:
        try:
            key_length = int(tvalues[0])
        except:
            # NOTE(review): bare except, and key_length is left unbound
            # here, so the `if` below raises NameError after the dialog.
            self._display.error_dialog('Invalid key length')
        if key_length == 2:
            # b: the two most frequent ciphertext digraphs as columns.
            most_common = Counter(self.split_str(in_str, 2)).most_common(2)
            d1, d2 = most_common[0][0], most_common[1][0]
            b = matrix([[_alph[0].index(d1[0]), _alph[0].index(d2[0])],
                        [_alph[0].index(d1[1]), _alph[0].index(d2[1])]])
            print b
            print '-----------------------'
            keys = []
            # Try every likely plaintext digraph pairing.
            for d in self.digraph_perms:
                v1, v2, v3, v4 = _alph[0].index(d[0][0]), _alph[0].index(d[0][1]), _alph[0].index(d[1][0]), _alph[0].index(d[1][1])
                a = matrix([[v1, v3], [v2, v4]])
                print a
                print d
                try:
                    # key * a = b (mod 26)  =>  key = b * a^-1 (mod 26)
                    key = (b * self.mod_invert_matrix(a)) % 26
                    keys.append(key)
                except:
                    # a was not invertible mod 26.
                    print 'fail'
            min_score = None
            best_key = None
            print keys
            for key in keys:
                print '----'
                print 'decrypting with'
                print key
                score = 0
                out_str = self.decrypt(key, in_str)
                if out_str:
                    # Squared distance from expected letter frequencies:
                    # lower score = more English-like decryption.
                    percentages, counts, exp_percentages = freq.run(out_str, title=self.title)
                    for i in range(_alph_length):
                        diff_sq = (exp_percentages[i] - percentages[i]) ** 2
                        score += diff_sq
                    print score
                    if score < min_score or min_score == None:
                        min_score = score
                        best_key = key
            print best_key
            print min_score
            # Serialise the winning key matrix as "a,b,c,d".
            return ''.join([',' + str(best_key.item(i)) if i > 0 else str(best_key.item(i)) for i in range(len(best_key) ** 2)])
def predict(self, X):
    """
    Perform classification on an array of test vectors X.

    Each element of X is a title string. For every title the naive-Bayes
    log score is computed per class, and the (class, log-score) pair
    with the highest score is appended to the returned list.
    """
    results = []
    for title in X:
        tokens = clean(title)
        scored = []
        for label in self.classes:
            total = log(self.classes[label]['prior'])
            for token in tokens:
                weight = self.words[token][label]
                # Zero likelihoods are skipped rather than driving the
                # score to -inf.
                if weight:
                    total += log(weight)
            scored.append((label, total))
        results.append(max(scored, key=lambda pair: pair[1]))
    return results
def correlations(self, in_str):
    """
    Count character coincidences between the text and rotations of itself.

    in_str -- text to analyse; cleaned to the class alphabet first
    returns -- a list where entry k holds the number of positions at
               which the text matches itself rotated left by k, for
               k = 1..100 (entry 0 is always 0); rotation stops early
               when the text is shorter than the limit.
    """
    limit = 100
    in_str = text.clean(in_str, self.alph_upp + self.alph_low)
    rotated = in_str
    data = [0] * (limit + 1)
    for shift in range(1, len(in_str) - 1):
        # Rotate left by one more character each pass.
        rotated = rotated[1:] + rotated[0]
        data[shift] = sum(1 for a, b in zip(in_str, rotated) if a == b)
        if shift == limit:
            break
    return data
def encrypt(self, key, in_str):
    """
    Encrypt *in_str* with the Hill cipher using matrix *key*.

    key    -- square key matrix; its dimension sets the n-graph size
    in_str -- text to encrypt; cleaned, then mapped to alphabet indices
    returns -- the ciphertext string
    """
    # Map each character to its index in the upper- or lower-case table.
    in_str = [_alph[0].index(c) if c in _alph[0] else _alph[1].index(c) for c in text.clean(in_str)]
    out_str = []
    # Process the text in key-sized column vectors.
    for i in range(0, len(in_str), len(key)):
        digraph = []
        for j in range(len(key)):
            # NOTE(review): raises IndexError when len(in_str) is not a
            # multiple of len(key) -- no padding is applied here.
            digraph.append([in_str[i + j]])
        digraph = (key * matrix(digraph)) % 26
        # NOTE(review): iterating a numpy matrix yields 1x1 row matrices;
        # using them as list indices below relies on numpy's implicit
        # scalar conversion -- confirm with the numpy version in use.
        for c in digraph:
            out_str.append(c)
    out_str = ''.join([_alph[0][c] for c in out_str])
    return out_str
def sanitize_key(self, key):
    """
    Normalise a keyword into a full-length substitution key.

    key -- user-supplied keyword, or None
    returns -- None when key is None; otherwise an _alph_length-character
               string: the deduplicated, cleaned, upper-cased keyword
               followed by the remaining alphabet letters in order.
    """
    # Idiom fixes: `is None` instead of `== None`; the former `global`
    # statement was unnecessary (both names are only read here).
    if key is None:
        return None
    key = text.clean(key.upper(), _alph[0])
    # Append the full alphabet so every letter is available, then keep
    # the first occurrence of each character until the key is complete.
    result = ''
    for c in key + _alph[0]:
        if c not in result:
            result += c
        if len(result) == _alph_length:
            break
    return result
def fit(self, X, y):
    """
    Fit Naive Bayes classifier according to X, y.

    X -- iterable of title strings
    y -- matching class labels

    Computes class priors and Laplace-smoothed word likelihoods (with
    smoothing factor self.factor), storing them on self.classes and
    self.words respectively.
    """
    vocabulary = []
    word_class_pairs = []
    self.classes_counter = dict(Counter(y))
    # Tally every token occurrence together with its class.
    for title, label in zip(X, y):
        for token in clean(title):
            vocabulary.append(token)
            word_class_pairs.append((token, label))
            self.classes[label]['appearances'] += 1
    # Class priors: fraction of samples carrying each label.
    for label in self.classes:
        self.classes[label]['prior'] = self.classes_counter[label] / len(y)
    self.words_counter = dict(Counter(vocabulary))
    self.pairs_counter = dict(Counter(word_class_pairs))
    vocab_size = len(self.words_counter)
    # Laplace-smoothed P(token | class).
    for token in self.words_counter:
        for label in self.classes:
            count = self.pairs_counter.get((token, label), 0)
            self.words[token][label] = (count + self.factor) / (
                self.classes[label]['appearances'] + self.factor * vocab_size)
def get_paper_info(url):
    """
    Fetch and parse a single arXiv abstract page.

    Assumes url structure: https://arxiv.org/abs/arXiv.id

    url -- abstract-page URL (a trailing '/' is stripped)
    returns -- an arXiv_entry built from the page's title, authors,
               abstract, primary subjects, id and pdf URL.
    """
    if url[-1] == '/':
        url = url[:-1]
    arXiv_id = url.split('/')[-1]
    urlpdf = url.replace('/abs/', '/pdf/')
    html_doc = make_request(url)
    S = BeautifulSoup(html_doc, 'html.parser')
    # Collapse the title onto one line with single spaces.
    title = S.find('h1', class_='title mathjax').text.replace('Title:\n', '')
    title = ' '.join(title.lstrip().rstrip().split())
    # NOTE(review): if no 'authors' div exists, `authors` is never bound
    # and the return below raises NameError; when several divs exist,
    # only the last one's authors are kept.
    for x in S.find_all('div', class_='authors'):
        x = x.text.replace('Authors:\n', '')
        authors = []
        for author in x.split('\n'):
            # Drop commas and re-join to normalise whitespace.
            auth = author.replace(',', '').split()
            author = ' '.join(auth)
            authors.append(author)
    txt = S.find('blockquote', class_='abstract mathjax').text.lstrip().rstrip()
    txt = txt.replace('Abstract: ', '')
    abstract = text.clean(txt)
    subjects = [s.text for s in S.find_all('span', class_='primary-subject')]
    return arXiv_entry(title, authors, abstract, subjects, arXiv_id, urlpdf)
def crypt(self, key, in_str, encrypt = False):
    """
    Encrypt (encrypt=True) or decrypt (encrypt=False) *in_str* with the
    Playfair cipher.

    key     -- passphrase expanded into a 5x5 key square by _keysquare
    in_str  -- text to process; cleaned to the cipher alphabet first
    returns -- the processed string

    Digraph rules follow
    http://en.wikipedia.org/wiki/Playfair_cipher#Clarification_with_picture
    """
    # Create key square (as its row strings and column strings).
    rows, cols = self._keysquare(key)
    # Clean the in_str so we don't bugger up the digraphs.
    in_str = text.clean(in_str)
    # Fix double letters in plaintext and pad, if we're encrypting.
    if encrypt:
        list_input = list(in_str)
        i = 0
        while i < len(list_input) - 1:
            if list_input[i] == list_input[i + 1]:
                list_input.insert(i + 1, 'X')
                # FIX: step past the inserted pad. Advancing by one (as
                # before) looped forever on a doubled 'X', since the pad
                # re-matched its neighbour on every pass.
                i += 2
            else:
                i += 1
        in_str = ''.join(list_input)
        if len(in_str) % 2 != 0:
            in_str += 'X'
    # Iterate digraphs.
    out_str = ''
    # FIX: '//' keeps integer semantics under both Python 2 and 3
    # (plain '/' produced a float range() argument on Python 3).
    for i in range(0, len(in_str) // 2):
        digraph = in_str[i * 2:i * 2 + 2]
        first = digraph[0]
        second = digraph[1]
        # Find row/column of each character (k renamed from i, which
        # shadowed the digraph loop variable).
        for k in range(0, 5):
            if rows[k].find(first) != -1:
                fri = k
            if rows[k].find(second) != -1:
                sri = k
            if cols[k].find(first) != -1:
                fci = k
            if cols[k].find(second) != -1:
                sci = k
        # Apply the three Playfair cases.
        if encrypt:
            if fri == sri:
                # Case 1 -- same row: take the letter to the right (wrapping).
                plain_digraph = rows[fri][(fci + 1) % 5] + rows[sri][(sci + 1) % 5]
            elif fci == sci:
                # Case 2 -- same column: take the letter below (wrapping).
                plain_digraph = cols[fci][(fri + 1) % 5] + cols[sci][(sri + 1) % 5]
            else:
                # Case 3 -- rectangle: swap columns.
                plain_digraph = rows[fri][sci] + rows[sri][fci]
        else:
            if fri == sri:
                # Case 1 -- same row: take the letter to the left.
                plain_digraph = rows[fri][(fci - 1) % 5] + rows[sri][(sci - 1) % 5]
            elif fci == sci:
                # Case 2 -- same column: take the letter above.
                plain_digraph = cols[fci][(fri - 1) % 5] + cols[sci][(sri - 1) % 5]
            else:
                # Case 3 -- rectangle: swap columns.
                plain_digraph = rows[fri][sci] + rows[sri][fci]
        out_str += plain_digraph
    return out_str
def getTitle(self):
    """Return the document title, or None if it is not set."""
    return None if self.title is None else clean(self.title)
def getAuthor(self):
    """Return the document author, or None if it is not set."""
    return None if self.author is None else clean(self.author)
def _perform(self, key, in_str):
    """Clean *in_str* against the character set *key* (with
    one_case=False) and return the result."""
    return text.clean(in_str, key, one_case=False)
while len(h) < 80: h = h + '*' titles.append(h) for dt_tag,dd_tag in zip(dl.find_all('dt'),\ dl.find_all('dd')): ## parsing dt tag index = int( dt_tag.find('a').text.replace('[', '').replace(']', '')) arxivID = dt_tag.find('a', title='Abstract').text.replace('arXiv:', '') URLabs = URLbase + dt_tag.find('a', title='Abstract')['href'] URLpdf = URLbase + dt_tag.find('a', title='Download PDF')['href'] ## parsing dd tag # Title title = dd_tag.find('div', class_='list-title mathjax').text title = text.clean(title.replace('Title: ', '')) # Authors authors = [] for auth in dd_tag.find('div', class_='list-authors').find_all('a'): authors.append(author(auth.text, URLbase + auth['href'])) # Subjects subjects = dd_tag.find('div', class_='list-subjects').text subjects = subjects.replace('Subjects: ', '').split(';') subjects = [x.lstrip().rstrip() for x in subjects] # Abstract abstract = text.clean(dd_tag.find('p', class_='mathjax').text) ## arXiv entry A = arXiv_entry(title=title, authors=authors, abstract=abstract,