def parse_arxiv(fname, URLbase='https://arxiv.org'):
    """
    Parse the html code from an arxiv "new" section.

    fname   -- path to a saved arXiv "new" listing HTML page
    URLbase -- prefix joined onto the relative hrefs found in the page
    returns -- (titles, sections): titles is the list of section header
               strings that were processed; sections is
               [NewSub, CrossList, Replacements], each a list of
               ArXivEntry objects (Replacements is skipped, so it stays
               empty).
    """
    # NOTE(review): `today` is computed but never used below.
    today = dt.datetime.now().date()
    # NOTE(review): file handle is never closed; consider `with open(...)`.
    html_doc = open(fname, 'r').read()
    S = BeautifulSoup(html_doc, 'html.parser')
    # First <h3> reads like "New submissions for <date>"; keep the date part.
    h3 = S.find('h3').text
    date = h3.split('for')[-1].lstrip().rstrip()
    # NOTE(review): '%y' expects a two-digit year -- confirm this matches
    # the header format of the pages actually being parsed.
    date = dt.datetime.strptime(date, '%a, %d %b %y').date()
    NewSub = []
    CrossList = []
    Replacements = []
    sections = [NewSub, CrossList, Replacements]
    titles = []
    sect = 0
    # Each listing section is an <h3> header paired with a <dl> of entries.
    for dl, h3 in zip(S.find_all('dl'), S.find_all('h3')):
        ## Skip replacements
        section = h3.text.split('for')[0].lstrip().rstrip()
        if section not in ['New submissions', 'Cross-lists']:
            continue
        ## report section. #TODO log this
        h = h3.text
        titles.append(h)
        for dt_tag, dd_tag in zip(dl.find_all('dt'), dl.find_all('dd')):
            ## parsing dt tag
            # "[3]" -> 3: position of the entry within the listing.
            index = int(dt_tag.find('a').text.replace('[', '').replace(']', ''))
            arxivID = dt_tag.find('a', title='Abstract').text.replace('arXiv:', '')
            URLabs = URLbase + dt_tag.find('a', title='Abstract')['href']
            URLpdf = URLbase + dt_tag.find('a', title='Download PDF')['href']
            ## parsing dd tag
            # Title
            title = dd_tag.find('div', class_='list-title mathjax').text
            title = text.clean(title.replace('Title: ', ''))
            # Authors
            authors = []
            for auth in dd_tag.find('div', class_='list-authors').find_all('a'):
                authors.append(Author(auth.text, URLbase + auth['href']))
            # Subjects
            subjects = dd_tag.find('div', class_='list-subjects').text
            subjects = subjects.replace('Subjects: ', '').split(';')
            subjects = [x.lstrip().rstrip() for x in subjects]
            # Abstract
            abstract = text.clean(dd_tag.find('p', class_='mathjax').text)
            ## arXiv entry
            A = ArXivEntry(title=title, authors=authors, abstract=abstract,
                           subject=subjects, ID=arxivID, urlabs=URLabs,
                           urlpdf=URLpdf, index=index)
            sections[sect].append(A)
        # Only advances for sections that were actually processed, so the
        # skipped "Replacements" section never receives entries.
        sect += 1
    return titles, sections
def get_score(self, input):
    """
    Score *input* against the stored n-graph frequency table.

    input -- candidate plaintext; a cleaned, upper-cased copy is scored
             (the caller's string is not modified)
    returns -- summed frequency score over all tetragraphs when
               self.tetragraphs is truthy, otherwise over all digraphs.
    """
    # NOTE: `input` shadows the builtin, but the name is part of the
    # public signature (callers may pass it by keyword), so it stays.
    input = text.clean(input.upper(), self.alphabet)
    length = len(input)
    # Build a char -> index table once instead of an O(len(alphabet))
    # .index() scan per character; setdefault keeps the FIRST occurrence,
    # matching .index() semantics for duplicated alphabet characters.
    idx = {}
    for i, c in enumerate(self.alphabet):
        idx.setdefault(c, i)
    score = 0
    if self.tetragraphs:
        for j in range(length - 3):
            # Base-26 position of the 4-gram in the frequency table.
            position = (17576 * idx[input[j]] + 676 * idx[input[j + 1]]
                        + 26 * idx[input[j + 2]] + idx[input[j + 3]])
            score += self.frequencies[position]
    else:
        for j in range(length - 1):
            # Base-26 position of the 2-gram in the frequency table.
            score += self.frequencies[26 * idx[input[j]] + idx[input[j + 1]]]
    return score
def dict(self):
    """
    Dictionary attack: try each dictionary word as a transposition key,
    score each decryption by frequency analysis, then run a
    monoalphabetic crack on the best candidate and display the result.

    (The method name shadows the builtin `dict`, but it is part of the
    public interface and is kept.)
    """
    in_str = self._display.get_input()
    dict_location = self._display.key_dialog(
        textboxes=[('Dictionary location', 'cipher/dict.txt')],
        title='Dictionary location')[1][0]
    # FIX: context manager closes the dictionary file (it used to leak);
    # local renamed from `dict`, which shadowed the builtin.
    with open(dict_location) as dict_file:
        word_list = [text.clean(word).upper() for word in dict_file.readlines()]
    transpo = transposition.Transposition(None, independent=False)
    monoalph = monoalphabetic.Monoalphabetic(None, independent=False)
    freq = frequencies.Frequencies(None, independent=False)
    best_score = 10000.0
    best_key = ''
    for word in word_list:
        if len(word) > 0:
            decrypted = transpo._perform((word, 0, 1, 0, 0), in_str)
            score = freq.analyse(decrypted, graph=False)
            if score < best_score:
                best_score = score
                best_key = word
            # Show promising candidates as we go.
            if score < 10:
                print('%s : %d : %s' % (word, score, self.decrypt(
                    (self.generate_key(decrypted), 'A', 1), decrypted)[:20]))
    print('Best key is %s' % best_key)
    # Let the user confirm/override the transposition key.
    best_key = self._display.key_dialog(
        textboxes=[('Transposition key', str(best_key).strip('()'))],
        title='Transposition Key')[1][0]
    in_str = transpo._perform((best_key, 0, 1, 0, 0), in_str)
    f_key = self.generate_key(in_str)
    in_str = self.decrypt((f_key, '1', 0), in_str)
    monoalph = monoalphabetic.Monoalphabetic(self._display, independent=False)
    f_key = monoalph.crack_key(in_str)
    out_str = monoalph.decrypt(f_key, in_str)
    self._display.show_output(out_str)
def get_key_phrases(text, language_code='en'):
    """
    Extract key phrases from *text* with AWS Comprehend.

    text          -- the document text
    language_code -- Comprehend language code (default 'en')
    returns -- de-duplicated, cleaned list of key-phrase strings
    raises  -- re-raises any Comprehend client error after logging it

    Texts at or over COMPREHEND_SIZE_LIMIT are chunked and sent through
    the batch API; smaller texts use the single-document call.
    """
    comprehend = boto3.client('comprehend')
    logger.info("Getting key phrases...")
    if getsizeof(text) >= COMPREHEND_SIZE_LIMIT:
        logger.info("Too big! Proceeding with chunkification")
        chunks, weights = chunkify(text, COMPREHEND_SIZE_LIMIT)
        try:
            r = comprehend.batch_detect_key_phrases(
                TextList=[val['text'] for val in chunks.values()],
                LanguageCode=language_code)
            kps = []
            for kp_list in r['ResultList']:
                kps.extend(kp_list['KeyPhrases'])
        except Exception as e:
            logger.error(str(e))
            raise e
    else:
        try:
            # BUG FIX: this path previously hard-coded LanguageCode='en',
            # silently ignoring the language_code parameter.
            r = comprehend.detect_key_phrases(Text=text,
                                              LanguageCode=language_code)
            kps = r['KeyPhrases']
        except Exception as e:
            logger.error(str(e))
            raise e
    return text_processing.clean(list(set(x['Text'] for x in kps)))
def crack_key(self, in_str):
    """
    Attempt to recover a 2x2 Hill-cipher key from *in_str*.

    Builds a matrix from the two most common ciphertext digraphs, tries
    every candidate plaintext digraph pairing from self.digraph_perms as
    the matching plaintext, and scores each resulting key by frequency
    analysis of its decryption.

    returns -- the best key serialised as a comma-separated string of
               its entries (only on the key_length == 2 path).
    """
    rvalues, tvalues = self._display.key_dialog(
        radioboxes=[['Auto solve', 'Crib']],
        textboxes=[['Key length', '2'], ['Crib', ''], ['Index', '0']],
        title=self.title + ' cracker')
    in_str = text.clean(in_str)
    freq = frequencies.Frequencies(None, independent=False)
    if rvalues[0] == 0:
        try:
            key_length = int(tvalues[0])
        except:
            # NOTE(review): bare except, and key_length is left unbound
            # here, so the `if` below raises NameError after the dialog.
            self._display.error_dialog('Invalid key length')
        if key_length == 2:
            # b: the two most frequent ciphertext digraphs as columns.
            most_common = Counter(self.split_str(in_str, 2)).most_common(2)
            d1, d2 = most_common[0][0], most_common[1][0]
            b = matrix([[_alph[0].index(d1[0]), _alph[0].index(d2[0])],
                        [_alph[0].index(d1[1]), _alph[0].index(d2[1])]])
            print b
            print '-----------------------'
            keys = []
            # Try every likely plaintext digraph pairing.
            for d in self.digraph_perms:
                v1, v2, v3, v4 = _alph[0].index(d[0][0]), _alph[0].index(d[0][1]), _alph[0].index(d[1][0]), _alph[0].index(d[1][1])
                a = matrix([[v1, v3], [v2, v4]])
                print a
                print d
                try:
                    # key * a = b (mod 26)  =>  key = b * a^-1 (mod 26)
                    key = (b * self.mod_invert_matrix(a)) % 26
                    keys.append(key)
                except:
                    # a was not invertible mod 26.
                    print 'fail'
            min_score = None
            best_key = None
            print keys
            for key in keys:
                print '----'
                print 'decrypting with'
                print key
                score = 0
                out_str = self.decrypt(key, in_str)
                if out_str:
                    # Squared distance from expected letter frequencies:
                    # lower score = more English-like decryption.
                    percentages, counts, exp_percentages = freq.run(out_str, title=self.title)
                    for i in range(_alph_length):
                        diff_sq = (exp_percentages[i] - percentages[i]) ** 2
                        score += diff_sq
                    print score
                    if score < min_score or min_score == None:
                        min_score = score
                        best_key = key
            print best_key
            print min_score
            # Serialise the winning key matrix as "a,b,c,d".
            return ''.join([',' + str(best_key.item(i)) if i > 0 else str(best_key.item(i)) for i in range(len(best_key) ** 2)])
def predict(self, X):
    """
    Perform classification on an array of test vectors X.

    Each element of X is a title string. For every title the naive-Bayes
    log score is computed per class, and the (class, log-score) pair
    with the highest score is appended to the returned list.
    """
    results = []
    for title in X:
        tokens = clean(title)
        scored = []
        for label in self.classes:
            total = log(self.classes[label]['prior'])
            for token in tokens:
                weight = self.words[token][label]
                # Zero likelihoods are skipped rather than driving the
                # score to -inf.
                if weight:
                    total += log(weight)
            scored.append((label, total))
        results.append(max(scored, key=lambda pair: pair[1]))
    return results
def correlations(self, in_str):
    """
    Count character coincidences between the text and rotations of itself.

    in_str -- text to analyse; cleaned to the class alphabet first
    returns -- a list where entry k holds the number of positions at
               which the text matches itself rotated left by k, for
               k = 1..100 (entry 0 is always 0); rotation stops early
               when the text is shorter than the limit.
    """
    limit = 100
    in_str = text.clean(in_str, self.alph_upp + self.alph_low)
    rotated = in_str
    data = [0] * (limit + 1)
    for shift in range(1, len(in_str) - 1):
        # Rotate left by one more character each pass.
        rotated = rotated[1:] + rotated[0]
        data[shift] = sum(1 for a, b in zip(in_str, rotated) if a == b)
        if shift == limit:
            break
    return data
def encrypt(self, key, in_str):
    """
    Encrypt *in_str* with the Hill cipher using matrix *key*.

    key    -- square key matrix; its dimension sets the n-graph size
    in_str -- text to encrypt; cleaned, then mapped to alphabet indices
    returns -- the ciphertext string
    """
    # Map each character to its index in the upper- or lower-case table.
    in_str = [_alph[0].index(c) if c in _alph[0] else _alph[1].index(c) for c in text.clean(in_str)]
    out_str = []
    # Process the text in key-sized column vectors.
    for i in range(0, len(in_str), len(key)):
        digraph = []
        for j in range(len(key)):
            # NOTE(review): raises IndexError when len(in_str) is not a
            # multiple of len(key) -- no padding is applied here.
            digraph.append([in_str[i + j]])
        digraph = (key * matrix(digraph)) % 26
        # NOTE(review): iterating a numpy matrix yields 1x1 row matrices;
        # using them as list indices below relies on numpy's implicit
        # scalar conversion -- confirm with the numpy version in use.
        for c in digraph:
            out_str.append(c)
    out_str = ''.join([_alph[0][c] for c in out_str])
    return out_str
def sanitize_key(self, key):
    """
    Normalise a keyword into a full-length substitution key.

    key -- user-supplied keyword, or None
    returns -- None when key is None; otherwise an _alph_length-character
               string: the deduplicated, cleaned, upper-cased keyword
               followed by the remaining alphabet letters in order.
    """
    # Idiom fixes: `is None` instead of `== None`; the former `global`
    # statement was unnecessary (both names are only read here).
    if key is None:
        return None
    key = text.clean(key.upper(), _alph[0])
    # Append the full alphabet so every letter is available, then keep
    # the first occurrence of each character until the key is complete.
    result = ''
    for c in key + _alph[0]:
        if c not in result:
            result += c
        if len(result) == _alph_length:
            break
    return result
def fit(self, X, y):
    """
    Fit Naive Bayes classifier according to X, y.

    X -- iterable of title strings
    y -- matching class labels

    Computes class priors and Laplace-smoothed word likelihoods (with
    smoothing factor self.factor), storing them on self.classes and
    self.words respectively.
    """
    vocabulary = []
    word_class_pairs = []
    self.classes_counter = dict(Counter(y))
    # Tally every token occurrence together with its class.
    for title, label in zip(X, y):
        for token in clean(title):
            vocabulary.append(token)
            word_class_pairs.append((token, label))
            self.classes[label]['appearances'] += 1
    # Class priors: fraction of samples carrying each label.
    for label in self.classes:
        self.classes[label]['prior'] = self.classes_counter[label] / len(y)
    self.words_counter = dict(Counter(vocabulary))
    self.pairs_counter = dict(Counter(word_class_pairs))
    vocab_size = len(self.words_counter)
    # Laplace-smoothed P(token | class).
    for token in self.words_counter:
        for label in self.classes:
            count = self.pairs_counter.get((token, label), 0)
            self.words[token][label] = (count + self.factor) / (
                self.classes[label]['appearances'] + self.factor * vocab_size)
def get_paper_info(url):
    """
    Fetch and parse a single arXiv abstract page.

    Assumes url structure: https://arxiv.org/abs/arXiv.id

    url -- abstract-page URL (a trailing '/' is stripped)
    returns -- an arXiv_entry built from the page's title, authors,
               abstract, primary subjects, id and pdf URL.
    """
    if url[-1] == '/':
        url = url[:-1]
    arXiv_id = url.split('/')[-1]
    urlpdf = url.replace('/abs/', '/pdf/')
    html_doc = make_request(url)
    S = BeautifulSoup(html_doc, 'html.parser')
    # Collapse the title onto one line with single spaces.
    title = S.find('h1', class_='title mathjax').text.replace('Title:\n', '')
    title = ' '.join(title.lstrip().rstrip().split())
    # NOTE(review): if no 'authors' div exists, `authors` is never bound
    # and the return below raises NameError; when several divs exist,
    # only the last one's authors are kept.
    for x in S.find_all('div', class_='authors'):
        x = x.text.replace('Authors:\n', '')
        authors = []
        for author in x.split('\n'):
            # Drop commas and re-join to normalise whitespace.
            auth = author.replace(',', '').split()
            author = ' '.join(auth)
            authors.append(author)
    txt = S.find('blockquote', class_='abstract mathjax').text.lstrip().rstrip()
    txt = txt.replace('Abstract: ', '')
    abstract = text.clean(txt)
    subjects = [s.text for s in S.find_all('span', class_='primary-subject')]
    return arXiv_entry(title, authors, abstract, subjects, arXiv_id, urlpdf)
def crypt(self, key, in_str, encrypt = False):
    """
    Encrypt (encrypt=True) or decrypt (encrypt=False) *in_str* with the
    Playfair cipher.

    key     -- passphrase expanded into a 5x5 key square by _keysquare
    in_str  -- text to process; cleaned to the cipher alphabet first
    returns -- the processed string

    Digraph rules follow
    http://en.wikipedia.org/wiki/Playfair_cipher#Clarification_with_picture
    """
    # Create key square (as its row strings and column strings).
    rows, cols = self._keysquare(key)
    # Clean the in_str so we don't bugger up the digraphs.
    in_str = text.clean(in_str)
    # Fix double letters in plaintext and pad, if we're encrypting.
    if encrypt:
        list_input = list(in_str)
        i = 0
        while i < len(list_input) - 1:
            if list_input[i] == list_input[i + 1]:
                list_input.insert(i + 1, 'X')
                # FIX: step past the inserted pad. Advancing by one (as
                # before) looped forever on a doubled 'X', since the pad
                # re-matched its neighbour on every pass.
                i += 2
            else:
                i += 1
        in_str = ''.join(list_input)
        if len(in_str) % 2 != 0:
            in_str += 'X'
    # Iterate digraphs.
    out_str = ''
    # FIX: '//' keeps integer semantics under both Python 2 and 3
    # (plain '/' produced a float range() argument on Python 3).
    for i in range(0, len(in_str) // 2):
        digraph = in_str[i * 2:i * 2 + 2]
        first = digraph[0]
        second = digraph[1]
        # Find row/column of each character (k renamed from i, which
        # shadowed the digraph loop variable).
        for k in range(0, 5):
            if rows[k].find(first) != -1:
                fri = k
            if rows[k].find(second) != -1:
                sri = k
            if cols[k].find(first) != -1:
                fci = k
            if cols[k].find(second) != -1:
                sci = k
        # Apply the three Playfair cases.
        if encrypt:
            if fri == sri:
                # Case 1 -- same row: take the letter to the right (wrapping).
                plain_digraph = rows[fri][(fci + 1) % 5] + rows[sri][(sci + 1) % 5]
            elif fci == sci:
                # Case 2 -- same column: take the letter below (wrapping).
                plain_digraph = cols[fci][(fri + 1) % 5] + cols[sci][(sri + 1) % 5]
            else:
                # Case 3 -- rectangle: swap columns.
                plain_digraph = rows[fri][sci] + rows[sri][fci]
        else:
            if fri == sri:
                # Case 1 -- same row: take the letter to the left.
                plain_digraph = rows[fri][(fci - 1) % 5] + rows[sri][(sci - 1) % 5]
            elif fci == sci:
                # Case 2 -- same column: take the letter above.
                plain_digraph = cols[fci][(fri - 1) % 5] + cols[sci][(sri - 1) % 5]
            else:
                # Case 3 -- rectangle: swap columns.
                plain_digraph = rows[fri][sci] + rows[sri][fci]
        out_str += plain_digraph
    return out_str
def getTitle(self):
    """Return the document title, or None if it is not set."""
    return None if self.title is None else clean(self.title)
def getAuthor(self):
    """Return the document author, or None if it is not set."""
    return None if self.author is None else clean(self.author)
def _perform(self, key, in_str):
    """Clean *in_str* against the character set *key* (with
    one_case=False) and return the result."""
    return text.clean(in_str, key, one_case=False)
while len(h) < 80: h = h + '*' titles.append(h) for dt_tag,dd_tag in zip(dl.find_all('dt'),\ dl.find_all('dd')): ## parsing dt tag index = int( dt_tag.find('a').text.replace('[', '').replace(']', '')) arxivID = dt_tag.find('a', title='Abstract').text.replace('arXiv:', '') URLabs = URLbase + dt_tag.find('a', title='Abstract')['href'] URLpdf = URLbase + dt_tag.find('a', title='Download PDF')['href'] ## parsing dd tag # Title title = dd_tag.find('div', class_='list-title mathjax').text title = text.clean(title.replace('Title: ', '')) # Authors authors = [] for auth in dd_tag.find('div', class_='list-authors').find_all('a'): authors.append(author(auth.text, URLbase + auth['href'])) # Subjects subjects = dd_tag.find('div', class_='list-subjects').text subjects = subjects.replace('Subjects: ', '').split(';') subjects = [x.lstrip().rstrip() for x in subjects] # Abstract abstract = text.clean(dd_tag.find('p', class_='mathjax').text) ## arXiv entry A = arXiv_entry(title=title, authors=authors, abstract=abstract,