def learn(self, name, phrase, channel):
    """Tokenize *phrase* and store (seed, answer) word-pair documents for *name*.

    Skips any phrase containing "password"; drops empty tokens, URL-ish
    tokens (containing "http" or "ftp:") and dot-prefixed tokens.  Yields
    the deferred database insert (inlineCallbacks-style generator).
    """
    name = self.aliases.resolve(name)
    if name not in self.users:
        self.users[name] = True
    # Never learn anything that might contain a credential.
    if "password" in phrase:
        return
    # FIX: use a list comprehension instead of filter() so the result is
    # always a list — the loop below needs len() and indexing, which a
    # Python 3 filter iterator would not support.
    words = [w for w in phrase.split(" ")
             if w and "http" not in w and "ftp:" not in w and w[0] != "."]
    now = datetime.datetime.utcnow()
    documents = []
    # One document per word boundary, including a leading ("", first word)
    # and trailing (last word, "") sentinel pair.
    for i in range(len(words) + 1):
        seed = UnicodeDammit.detwingle(words[i - 1] if i > 0 else "")
        answer = UnicodeDammit.detwingle(words[i] if i < len(words) else "")
        documents.append({
            "name": name,
            "seed": seed,
            "answer": answer,
            "added": now,
            "random": random.random()
        })
    yield self.db.insert(documents, safe=True)
def create(self, soupfragment):
    # Build a dict describing one scraped forum thread from a soup fragment:
    # link, title, answers/views counts and the source location.
    result = dict()
    field = self._getfield_info(soupfragment)
    title = ""
    result["link"] = ""
    result["answers"] = ""
    result["views"] = ""
    result["location"] = ""
    if self.urlobject is not None:
        result["location"] = self.urlobject.description()
        #result['location'] = self.webclient.get_url_desc()
    if field is not None:
        # UnicodeDammit normalizes whatever encoding the scraped title is in.
        title = UnicodeDammit(field.a.contents[0]).unicode_markup
        result["link"] = field.a['href']
    fragment = self._get_answer_and_viewa_fragment(soupfragment)
    if fragment is not None:
        result["answers"] = self._get_number_from(fragment.contents[0].strip())
        result["views"] = self._get_number_from(fragment.contents[2].strip())
    else:
        # -1 marks threads where the answers/views block could not be located.
        print "No answer and view bloq identified in thread: ", result["link"]
        result["answers"] = -1
        result["views"] = -1
    result["title"] = title.strip()
    #result['next_url'] = _nextUrl(soupfragment)
    return result
def _sub_read(self, f):
    """Yield (curr_id, class_name, feature_dict) triples from a MegaM file.

    Lines starting with '#' set the ID of the following example;
    TRAIN/TEST/DEV marker lines and blank lines are skipped.
    Raises ValueError when a line repeats a feature name.
    """
    example_num = 0
    curr_id = 'EXAMPLE_0'
    for line in f:
        # Process encoding
        if not isinstance(line, text_type):
            line = UnicodeDammit(line, ['utf-8', 'windows-1252']).unicode_markup
        line = line.strip()
        # Handle instance lines
        if line.startswith('#'):
            curr_id = line[1:].strip()
        elif line and line not in ['TRAIN', 'TEST', 'DEV']:
            split_line = line.split()
            num_cols = len(split_line)
            del line
            # Line is just a class label
            if num_cols == 1:
                class_name = safe_float(split_line[0], replace_dict=self.class_map)
                field_pairs = []
            # Line has a class label and feature-value pairs
            elif num_cols % 2 == 1:
                class_name = safe_float(split_line[0], replace_dict=self.class_map)
                field_pairs = split_line[1:]
            # Line just has feature-value pairs
            elif num_cols % 2 == 0:
                class_name = None
                field_pairs = split_line
            curr_info_dict = {}
            if len(field_pairs) > 0:
                # Get current instances feature-value pairs
                field_names = islice(field_pairs, 0, None, 2)
                # Convert values to floats, because otherwise
                # features'll be categorical
                field_values = (safe_float(val) for val in
                                islice(field_pairs, 1, None, 2))
                # Add the feature-value pairs to dictionary
                curr_info_dict.update(zip(field_names, field_values))
                # zip() silently drops duplicate keys, so a length mismatch
                # means the same feature name appeared twice on this line.
                if len(curr_info_dict) != len(field_pairs) / 2:
                    raise ValueError(('There are duplicate feature ' +
                                      'names in {} for example ' +
                                      '{}.').format(self.path_or_list, curr_id))
            yield curr_id, class_name, curr_info_dict
            # Set default example ID for next instance, in case we see a
            # line without an ID.
            example_num += 1
            curr_id = 'EXAMPLE_{}'.format(example_num)
def corpus_generator(self):
    """Yield whitespace-separated tokens from the corpus file, one at a time.

    Blank lines are skipped; every 100000th nonblank line is logged.
    Lines are decoded via UnicodeDammit and lowercased when self.lower
    is set.
    """
    nonblank = 0
    with open(self.corpus_path, 'rb') as corpus_file:
        for raw in corpus_file:
            text = UnicodeDammit(raw.strip()).unicode_markup
            if not text:
                continue
            if self.lower:
                text = text.lower()
            nonblank += 1
            if nonblank % 100000 == 0:
                logging.info('Read {} nonblank lines'.format(nonblank))
            for token in re.split(r'\s+', text):
                yield token
def _sub_read(self, f):
    """Yield (curr_id, class_name, feature_dict) triples from a LibSVM file.

    SKLL-produced files carry metadata in the line comment: the example ID
    plus mappings from label/feature numbers back to their names; those
    mappings are applied when present.  Raises ValueError for lines that do
    not match self.line_regex.
    """
    for example_num, line in enumerate(f):
        curr_id = ''
        label_map = None
        feat_map = None
        # Decode line if it's not already str
        if isinstance(line, bytes):
            line = UnicodeDammit(line, ['utf-8', 'windows-1252']).unicode_markup
        match = self.line_regex.search(line.strip())
        if not match:
            raise ValueError('Line does not look like valid libsvm format'
                             '\n{}'.format(line))
        # Metadata is stored in comments if this was produced by SKLL
        if match.group('comments') is not None:
            # Store mapping from feature numbers to names
            if match.group('feat_map'):
                feat_map = {}
                for pair in match.group('feat_map').split():
                    number, name = pair.split('=')
                    # Undo the escaping SKLL applied when writing the file.
                    for orig, replacement in \
                            LibSVMReader.LIBSVM_REPLACE_DICT.items():
                        name = name.replace(orig, replacement)
                    feat_map[number] = name
            else:
                feat_map = None
            # Store mapping from label/class numbers to names
            if match.group('label_map'):
                label_map = dict(pair.split('=') for pair in
                                 match.group('label_map').strip().split())
            else:
                label_map = None
            curr_id = match.group('example_id').strip()
        if not curr_id:
            curr_id = 'EXAMPLE_{}'.format(example_num)
        class_num = match.group('label_num')
        # If we have a mapping from class numbers to labels, get label
        if label_map:
            class_name = label_map[class_num]
        else:
            class_name = class_num
        class_name = safe_float(class_name, replace_dict=self.class_map)
        curr_info_dict = dict(self._pair_to_tuple(pair, feat_map) for pair in
                              match.group('features').strip().split())
        yield curr_id, class_name, curr_info_dict
def _sub_read(self, f): field_names = [] # Process ARFF header for line in f: # Process encoding if not isinstance(line, text_type): decoded_line = UnicodeDammit(line, ['utf-8', 'windows-1252']).unicode_markup else: decoded_line = line line = decoded_line.strip() # Skip empty lines if line: # Split the line using CSV reader because it can handle # quoted delimiters. split_header = self.split_with_quotes(line) row_type = split_header[0].lower() if row_type == '@attribute': # Add field name to list field_name = split_header[1] field_names.append(field_name) # Check if we're doing regression if field_name == self.label_col: self.regression = (len(split_header) > 2 and split_header[2] == 'numeric') # Save relation if specified elif row_type == '@relation': self.relation = split_header[1] # Stop at data elif row_type == '@data': break # Skip other types of rows (relations) # Create header for CSV if PY2: io_type = BytesIO else: io_type = StringIO with io_type() as field_buffer: csv.writer(field_buffer, dialect='arff').writerow(field_names) field_str = field_buffer.getvalue() # Set label_col to be the name of the last field, since that's standard # for ARFF files if self.label_col != field_names[-1]: self.label_col = None # Process data as CSV file return super(ARFFReader, self)._sub_read(chain([field_str], f))
def convert_to_libsvm(lines): ''' Converts a sequence of lines (e.g., a file or list of strings) in MegaM format to LibSVM format. :param lines: The sequence of lines to convert. :type lines: L{file} or L{list} of L{str} :return: A tuple of the newly formatted data, the mappings from class names to numbers, and the mappings from feature names to numbers. :rtype: 3-L{tuple} of (L{list} of L{unicode}, L{dict}, and L{dict}) ''' # Initialize variables field_num_dict = UniqueNumberDict() class_num_dict = UniqueNumberDict() result_list = [] # Iterate through MegaM file for line in lines: line_fields = set() # Process encoding line = UnicodeDammit(line, ['utf-8', 'windows-1252']).unicode_markup.strip() # Ignore comments (and TEST/DEV lines) if not line.startswith('#') and not line == 'TEST' and not line == 'DEV': result_string = '' split_line = line.split() result_string += '{0}'.format(class_num_dict[split_line[0]]) # Handle features if there are any if len(split_line) > 1: del split_line[0] # Loop through all feature-value pairs printing out pairs # separated by commas (and with feature names replaced with # numbers) for field_num, value in sorted(zip((field_num_dict[field_name] for field_name in islice(split_line, 0, None, 2)), (float(value) if value != 'N/A' else 0.0 for value in islice(split_line, 1, None, 2)))): # Check for duplicates if field_num in line_fields: field_name = (field_name for field_name, f_num in field_num_dict.items() if f_num == field_num).next() raise AssertionError("Field {} occurs on same line twice.".format(field_name)) # Otherwise output non-empty features elif value != 'N/A' and float(value): result_string += ' {}:{}'.format(field_num, value) line_fields.add(field_num) result_list.append(result_string) return result_list, class_num_dict, field_num_dict
def ramble(self, name=None, seed=""):
    """Generate a Markov-style sentence for *name*, optionally around *seed*.

    Walks backwards from the seed via self.prev and forwards via self.next
    until roughly 300 characters are accumulated (inlineCallbacks-style
    generator; result delivered through returnValue).
    """
    if name:
        name = self.aliases.resolve(name)
        if name not in self.users:
            returnValue("")
    message = []
    if seed:
        seed = UnicodeDammit.detwingle(seed)
        chunk = seed
        # Extend the sentence backwards from the seed, then restore order.
        while chunk and len(" ".join(message)) < 300:
            message.append(chunk)
            chunk = yield self.prev(name, chunk)
        message.reverse()
    # Extend forwards from the seed (or from scratch when no seed).
    chunk = yield self.next(name, seed)
    while chunk and len(" ".join(message)) < 300:
        message.append(chunk)
        chunk = yield self.next(name, chunk)
    # Very short result: try one more forward step.
    if not chunk and len(" ".join(message)) < 30:
        chunk = yield self.next(name, chunk)
    response = (" ".join(message)).decode("utf8")
    # If all we produced was the seed itself, try again without a seed.
    if seed and response == seed.decode("utf8"):
        response = yield self.ramble(name)
    returnValue(response)
def _fetch_data(self, entry_name, url):
    """Fetch a Tieba bar page and scrape visit/post/group counters.

    Returns (entry_name, data_dict) on success, or (None, None) on
    urllib2 HTTP/URL errors.  entry_name may be replaced by the bar title
    scraped from the page; the original is kept under
    'original_entry_name'.
    """
    # url = url.decode('utf-8')
    # if url[:5] == 'http:':
    #     url = 'https' + url[4:]
    # url = url.encode('utf-8')
    original_entry_name = entry_name
    data = dict()
    try:
        with contextlib.closing(urllib2.urlopen(url.encode('utf-8'))) as page_source:
            page_content = page_source.read()
        # Let UnicodeDammit sniff the encoding, then re-parse with lxml
        # using that encoding.
        doc = UnicodeDammit(page_content, is_html=True)
        parser = lxml.html.HTMLParser(encoding=doc.original_encoding)
        doc = lxml.html.document_fromstring(page_content, parser=parser)
        # Two page layouts: "star" theme and "card" theme selectors.
        bar_name = doc.xpath('//a[contains(@class, "star_title_h3")]')
        if not bar_name:
            bar_name = doc.xpath('//a[contains(@class, "card_title_fname")]')
        if type(bar_name) is list and len(bar_name) > 0:
            entry_name = bar_name[0].text_content().strip()
        num_visits = doc.xpath('//span[contains(@class, "j_visit_num")]')
        if not num_visits:
            num_visits = doc.xpath('//span[contains(@class, "card_menNum")]')
        num_posts = doc.xpath('//span[contains(@class, "j_post_num")]')
        if not num_posts:
            num_posts = doc.xpath('//span[contains(@class, "card_infoNum")]')
        if type(num_visits) is list and len(num_visits) > 0:
            num_visits = num_visits[0].text_content()
            num_visits = cogtu_misc.get_first_number_from_text(num_visits)
        else:
            num_visits = 0
        if type(num_posts) is list and len(num_posts) > 0:
            num_posts = num_posts[0].text_content()
            num_posts = cogtu_misc.get_first_number_from_text(num_posts)
        else:
            num_posts = 0
        num_groups = doc.xpath("//a[contains(@class, 'star_nav_ico_group')]/span")
        if type(num_groups) is list and len(num_groups) > 0:
            num_groups = num_groups[0].text_content()
            num_groups = cogtu_misc.get_first_number_from_text(num_groups)
        else:
            num_groups = 0
    except urllib2.HTTPError:
        logging.info('urllib2.HTTPError. Skip.')
        return None, None
    except urllib2.URLError:
        logging.info('urllib2.URLError. Skip.')
        return None, None
    data['num_visits'] = int(num_visits)
    data['num_posts'] = int(num_posts)
    data['num_groups'] = int(num_groups)
    data['entry_name'] = entry_name
    data['original_entry_name'] = original_entry_name
    data['url'] = url
    return entry_name, data
def __init__(self, url):  # logs info,warning,error,critical,debug events.
    '''
    Description: This is the class constructor and is going to get a simple
    url as input and parse it based on RFC1738.
    Status: In Progress.
    Usage: This is going to be used by the connection manager and the
    active/passive scanner to extract url variables.
    '''
    # detwingle() normalizes mixed UTF-8/windows-1252 byte strings; the
    # second argument is its main-encoding parameter.
    # NOTE(review): detwingle expects bytes input — confirm callers never
    # pass an already-decoded unicode url.
    self.url = UnicodeDammit.detwingle(url, 'UTF-8')
    self.defaultHttpsPort = 443
    self.defaultHttpPort = 80
    urlLogger.logInfo("--- Package: UrlManager - Module: UrlHandler Class: urlHandler Initiated ---")
def remove_evernote_link(link, html):
    """Remove *link* from an Evernote note's HTML, cleaning up leftovers.

    After deleting the link text itself, strips the now-empty <li>, any
    dangling green "|" separator span, and collapses doubled separators.
    """
    html = UnicodeDammit(html, ["utf-8"], is_html=True).unicode_markup
    # NOTE(review): link_converted is computed but never used below — the
    # raw link.WholeRegexMatch is what gets removed; confirm intent.
    link_converted = UnicodeDammit(link.WholeRegexMatch, ["utf-8"], is_html=True).unicode_markup
    sep = u'<span style="color: rgb(105, 170, 53);"> | </span>'
    sep_regex = escape_regex(sep)
    no_start_tag_regex = r"[^<]*"
    # Matches a whole <tag ...>...</tag> element whose text contains {1}.
    regex_replace = r"<{0}[^>]*>[^<]*{1}[^<]*</{0}>"
    # html = re.sub(regex_replace.format('li', link.WholeRegexMatch), "", html)
    # Remove link
    html = html.replace(link.WholeRegexMatch, "")
    # Remove empty li
    html = re.sub(regex_replace.format("li", no_start_tag_regex), "", html)
    # Remove dangling separator
    regex_span = regex_replace.format("span", no_start_tag_regex) + no_start_tag_regex + sep_regex
    html = re.sub(regex_span, "", html)
    # Remove double separator
    html = re.sub(sep_regex + no_start_tag_regex + sep_regex, sep_regex, html)
    return html
def selectdir(geturl):
    """Interactively pick a subdirectory link from the page at *geturl*.

    Lists all hrefs ending in '/', prompts the user (0 = parent directory
    when not already at the site root), and returns
    (chosen_url, subcont_flag, parent_url) where subcont is 1 when a
    subdirectory was chosen.
    """
    r = scraper.get(geturl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
    # detwingle + utf-8 decode guards against mixed-encoding pages.
    rt = UnicodeDammit.detwingle(r.text)
    html = BeautifulSoup(rt.decode('utf-8'), "html.parser")
    if debug == 1:
        orenc = str(html.original_encoding)
        print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
    findlinks = html.findAll('a')
    dirlist = []
    for link in findlinks:
        b = link.get('href')
        # Skip self/parent links; keep only hrefs that end with '/'.
        if not re.match(r'^((\.\.)?\/)$', str(b)):
            if re.search(r'^(.*)(\/)$', str(b)):
                dirlist.append(b)
    p = urlparse(geturl)
    part = p.path.split('/')[-1]
    path = p.path.rstrip(part)
    if '/' not in path[:1]:
        path = '/' + path
    urlfqdn = p.scheme + '://' + p.netloc
    parent = urlfqdn + path
    i = 0
    dirtotal = len(dirlist)
    if dirtotal > 0:
        print('\nFOUND %d DIRECTORIES: \n' % dirtotal)
        while i < dirtotal:
            sel = i + 1
            print(str(sel) + ' - ' + str(dirlist[i]))
            i += 1
        print('')
        lim = dirtotal + 1
        # Offer "0 - back to parent" unless we are already at the site root.
        matchtop = r'^(%s)(\/)?$' % urlfqdn
        if not re.match(matchtop, geturl):
            print('0 - BACK TO PARENT DIRECTORY \n')
            startsel = '0-%d' % dirtotal
        else:
            startsel = '1-%d' % dirtotal
        selectdir = raw_input('make a selection [%s] --> ' % startsel)
        if not int(selectdir) in range(0, lim):
            selectdir = raw_input('invalid entry. please enter a selection %s --> ' % startsel)
        if selectdir == '0':
            geturl = parent
            subcont = 0
        else:
            n = int(selectdir) - 1
            usedir = dirlist[n]
            geturl = parent + usedir
            subcont = 1
    else:
        print('\nNO DIRECTORIES FOUND. using current directory.. \n')
        subcont = 0
        geturl = parent + part
    return geturl, subcont, parent
def clean_google_title(self, title):
    """Clean a Google result title.

    Returns a tuple (readable_title, normalized_title, has_dot):
    readable_title with markup/entities stripped, normalized_title
    lowercased with all non-word characters removed, and has_dot True
    when the original title was truncated with an ellipsis.
    """
    cleaned = UnicodeDammit(title).unicode_markup
    # clean step 1
    # BUGFIX: don't remove [xxx]. eg: "OQL[C++]: Ext...' — only strip <...>.
    cleaned = re.sub("(<(.*?)>)", "", cleaned)
    # An ellipsis in the *original* title marks a truncated result.
    has_dot = re.compile("(\.\.\.|…)", re.I).search(title) is not None
    # clean step 2: drop special spacing/ellipsis chars and HTML entities.
    cleaned = re.sub("( |►|…)", "", cleaned)
    cleaned = re.sub("(&#.+?;|&.+?;)", "", cleaned)
    cleaned = cleaned.strip()
    readable_title = cleaned
    # Shrink to letters/digits only, lowercased, for comparisons.
    normalized = re.sub("\W", "", cleaned).lower()
    return (readable_title, normalized, has_dot)
def document_generator(path, lower=False):
    '''
    Default document reader.  Takes a path to a file with one document per
    line, with tokens separated by whitespace, and yields lists of tokens
    per document.  This could be replaced by any function that yields lists
    of tokens.  See main() for how it is called.

    Note: this uses BeautifulSoup's UnicodeDammit to convert to unicode.
    '''
    with open(path, 'rb') as doc_file:
        nonblank = 0
        for raw in doc_file:
            text = UnicodeDammit(raw.strip()).unicode_markup
            if not text:
                continue
            if lower:
                text = text.lower()
            nonblank += 1
            if nonblank % 100000 == 0:
                logging.info('Read {} nonblank lines'.format(nonblank))
            yield re.split(r'\s+', text)
def formatForReddit(self, feedEntry, postType, subreddit, raw):
    """Convert an RSS feed entry into a reddit submission dict.

    Returns {'comments', 'link', 'subreddit', 'title'}; raw entries (e.g.
    Twitter) get URL stripping/linkifying and encoding cleanup first.
    """
    if 'content' in feedEntry:
        content = feedEntry['content'][0]['value']
    elif 'description' in feedEntry:
        content = feedEntry.description
    else:
        content = ''
    logging.debug(content)
    parser = EveRssHtmlParser()
    title = feedEntry['title']
    # some feeds like Twitter are raw so the parser hates it.
    if (raw):
        regex_of_url = '(https?:\/\/[\dA-z\.-]+\.[A-z\.]{2,6}[\/\w&=#\.\-\?]*)'
        title = re.sub(regex_of_url, '', title)
        # Restore scheme-less twitter image links before linkifying URLs.
        clean_content = content.replace(' pic.twitter.com', ' http://pic.twitter.com')
        clean_content = re.sub(regex_of_url, '<a href="\\1">link</a>', clean_content)
        clean_content = UnicodeDammit.detwingle(clean_content)
        #logging.info(clean_content)
        u = UnicodeDammit(clean_content, smart_quotes_to='html', is_html=False)
        # fix twitter putting ellipses on the end
        content = u.unicode_markup.replace(unichr(8230), ' ...')
        logging.debug('.....')
    if "tumblr.com" in content:
        # Replace with larger images (hopefully such images exist)
        content = content.replace('_500.', '_1280.')
    # Added the .replace because the parser does something funny to them and
    # removes them before I can handle them
    # NOTE(review): the first replace argument is likely a non-breaking
    # space that was mangled in transit — confirm against the original file.
    content = content.replace(' ', ' ')
    content = content.replace('•', '*').replace('·', '*')
    content = content.replace('“', '\'').replace('”', '\'')
    content = re.sub('( [ ]+)', ' ', content)
    parser.feed(content)
    # Prepend the source link to the first comment chunk, sign the last.
    parser.comments[0] = '%s\n\n%s' % (feedEntry['link'], parser.comments[0])
    parser.comments[-1] += self.config['signature']
    if 'author' in feedEntry:
        author = '~' + feedEntry['author'].replace('@', ' at ')
    else:
        author = ''
    return {'comments': parser.comments,
            'link': feedEntry['link'],
            'subreddit': subreddit,
            'title': '[%s] %s %s' % (postType, title, author)}
def getContent(soup, source=''): newContent = [] # Cleanning phase genericCleaning(soup) sourceSpecificcleaning(soup, source) # f = open("content.html", 'w'); f.write(soup.prettify().encode('utf-8')); f.close(); # Finding content in the tree bestElem = None; bestText = ''; for el in soup.findAll(True): score = 0.0; hasTitle = False if el.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7'] and el.parent.name == '[document]': score += 3 for c in el: if c.name == 'br': # business insider style score += 0.5 if c.name == 'p': score += 1.0 if not hasTitle and c.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7']: score += 1.0 hasTitle = True if score >= 3.0: # at least 3 paragraphs textOutput = getText(el) if float(len(textOutput))/score > 20.0: # we need at least 20 characters per container newContent.append(textOutput) elif score >= 1.0: if bestElem is None: bestElem = el; bestText = getText(el, False) else: a = getText(el, False) if bestElem is None or len(a) > len(bestText): bestElem = el; bestText = a if len(newContent) == 0 and bestElem is not None: # in case nothing had a score of 3, but something had a score of 1 or more newContent.append(bestText) finalText = UnicodeDammit(u'\n'.join(newContent), smart_quotes_to='ascii').unicode_markup return finalText.replace('\n\n', '\n')
def normalize(s):
    """Return *s* as unicode, trying progressively more forgiving decodings.

    Order: already-unicode passthrough; plain UTF-8; UTF-8 with the last
    byte dropped (handles strings truncated mid multi-byte sequence);
    detwingled UTF-8; finally UnicodeDammit guessing between utf8 and
    windows-1252 (which cannot fail).
    """
    if isinstance(s, unicode):
        return s
    # FIX: the original used bare `except:` clauses, which would also
    # swallow unrelated errors (KeyboardInterrupt, typos, ...).  Only a
    # failed decode should trigger the next fallback.
    try:
        u = s.decode("utf8")
    except UnicodeDecodeError:
        try:
            u = (s[:-1]).decode("utf8")
        except UnicodeDecodeError:
            try:
                u = UnicodeDammit.detwingle(s).decode("utf8")
            except UnicodeDecodeError:
                u = UnicodeDammit(s, ["utf8", "windows-1252"]).unicode_markup
    return u
def getpage(cfurl):
    """Fetch *cfurl* and pretty-print its HTML when the response is text.

    Non-text responses only set a local marker; debug mode additionally
    reports the detected encoding and the finished-list length.
    """
    r = scraper.get(cfurl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
    if 'text' in r.headers.get('Content-Type'):
        # detwingle + utf-8 decode guards against mixed-encoding pages.
        rt = UnicodeDammit.detwingle(r.text)
        html = BeautifulSoup(rt.decode('utf-8'), "html.parser")
        print('\r\n--------------------------------------------------------\r\n')
        if debug == 1:
            orenc = str(html.original_encoding)
            print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
        bs = html.prettify(formatter=None)
        print(bs)
        print('\r\n--------------------------------------------------------\r\n')
    else:
        # NOTE(review): `found` is set but not returned or used here.
        found = -1
    if debug == 1:
        print('\n\033[34mDEBUG: finished list length: \033[37;1m%d \033[0m\n' % len(finished))
def format(self, script):
    """Convert a Crunchyroll-style <subtitle_script> XML into ASS text.

    Builds the [Script Info], [V4+ Styles] and [Events] sections from the
    <subtitle_script>, <style> and <event> elements' attributes.
    """
    # detwingle first: scripts can mix UTF-8 and windows-1252 bytes.
    dammit = UnicodeDammit.detwingle(script)
    soup = BeautifulSoup(dammit, from_encoding="utf8")
    header = soup.find('subtitle_script')
    header = "[Script Info]\nTitle: "+header['title']+"\nScriptType: v4.00+\nWrapStyle: "+header['wrap_style']+"\nPlayResX: 624\nPlayResY: 366\nScaledBorderAndShadow: yes\nYCbCr Matrix: TV.709\n\n";
    styles = "[V4+ Styles]\nFormat: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n";
    events = "\n[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n";
    stylelist = soup.findAll('style')
    eventlist = soup.findAll('event')
    # One "Style:" line per <style> element, fields in ASS column order.
    for style in stylelist:
        styles += "Style: " + style['name'] + "," + style['font_name'] + "," + style['font_size'] + "," + style['primary_colour'] + "," + style['secondary_colour'] + "," + style['outline_colour'] + "," + style['back_colour'] + "," + style['bold'] + "," + style['italic'] + "," + style['underline'] + "," + style['strikeout'] + "," + style['scale_x'] + "," + style['scale_y'] + "," + style['spacing'] + "," + style['angle'] + "," + style['border_style'] + "," + style['outline'] + "," + style['shadow'] + "," + style['alignment'] + "," + style['margin_l'] + "," + style['margin_r'] + "," + style['margin_v'] + "," + style['encoding'] + "\n"
    # One "Dialogue:" line per <event> element.
    for event in eventlist:
        events += "Dialogue: 0,"+event['start']+","+event['end']+","+event['style']+","+event['name']+","+event['margin_l']+","+event['margin_r']+","+event['margin_v']+","+event['effect']+","+event['text']+"\n"
    formattedSubs = header+styles+events
    return formattedSubs
def to_unicode(data, is_html=False, detwingle=False, verbose=True, lang=None):
    """Decode *data* (text of unknown encoding) to unicode.

    When *detwingle* is set and the sniffed encoding is windows-1252, the
    input is detwingled and re-sniffed.  When *lang* is given (or 'auto',
    which triggers language detection), decoding is redone with the
    language-specific chared model instead of the sniffed result.
    """
    dammit = UnicodeDammit(data, is_html=is_html)
    if detwingle and dammit.original_encoding == 'windows-1252':
        dammit = UnicodeDammit(UnicodeDammit.detwingle(data), is_html=is_html)
    if verbose:
        sys.stderr.write("Original encoding (via BS): %s\n"
                         % (dammit.original_encoding))
    if lang is None:
        return dammit.unicode_markup
    if lang == 'auto':
        lang = _guess_lang_from_data(dammit.unicode_markup, is_html=is_html)
        if verbose:
            sys.stderr.write("Detected language: %s\n" % (lang))
    return _to_unicode_chared(data, lang, verbose=verbose)
def unicode_dammit_example():
    """Demonstrate UnicodeDammit decoding and detwingle on sample strings."""
    # Install the 'chardet' or 'cchardet' Python libraries for better guesses
    ### Take a string with unknown encoding and make the string Unicode
    weirdass_string = "Sacr\xc3\xa9 bleu!"
    dammit = UnicodeDammit(weirdass_string)
    print "Original Word with weird encoding:", weirdass_string
    print "Dammit Print:", (dammit.unicode_markup)
    print "Dammit Type:", (dammit.original_encoding)
    ### Take a doc with mostly UTF-8 encoding (and misc encodings due to mult
    # data sources) and convert to UTF-8 Unicode with .Dammit.detwingle()
    snowmen = (u"\N{SNOWMAN}" * 3)
    quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}")
    doc = snowmen.encode("utf8") + quote.encode("windows-1252")
    # So now we have one doc with two encodings in it, printing is a mess
    #print "Weird Decoding doc with utf8:", doc # messed up, won't print
    #print (doc.decode("windows-1252")) # So messed up it doesn't even print
    # Decode using UnicodeDammit.detwingle() converts the string to pure UTF-8
    new_doc = UnicodeDammit.detwingle(doc)
    print new_doc.decode("utf8")
def getlinks(cfurl):
    """Fetch *cfurl* and print every anchor href found; return the count.

    Hrefs already contained in cfurl and bare self/parent links ("/" or
    "../") are not printed.  Sleeps 4 seconds when no links are found.
    """
    response = scraper.get(cfurl, stream=True, verify=False,
                           proxies=proxystring, allow_redirects=True)
    detwingled = UnicodeDammit.detwingle(response.text)
    soup = BeautifulSoup(detwingled.decode('utf-8'), "html.parser")
    if debug == 1:
        encoding = str(soup.original_encoding)
        print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % encoding)
    pretty = soup.prettify(formatter=None)
    anchors = soup.findAll('a')
    if len(anchors) > 0:
        foundlinks = len(anchors)
        print('\nFOUND %s LINKS AT %s:\n' % (str(foundlinks), cfurl))
        for anchor in anchors:
            href = str(anchor.get('href'))
            if href not in cfurl and not re.match(r'^(\.\.)?\/$', href):
                print(href)
        print('')
    else:
        print('\nNO LINKS FOUND.\n')
        foundlinks = 0
        time.sleep(4)
    return foundlinks
def to_unicode(data, is_html=False, detwingle=False, verbose=False, lang=None):
    """
    Produce unicode from text of unknown encoding.
    Input: bytestring

    When *detwingle* is set and the sniffed encoding is windows-1252, the
    input is detwingled and re-sniffed.  When *lang* is given (or 'auto',
    which triggers language detection), decoding is redone with the
    language-specific chared model instead of the sniffed result.
    """
    dammit = UnicodeDammit(data, is_html=is_html)
    if detwingle and dammit.original_encoding == 'windows-1252':
        new_data = UnicodeDammit.detwingle(data)
        dammit = UnicodeDammit(new_data, is_html=is_html)
    if verbose:
        sys.stderr.write("Original encoding (via BS): %s\n"
                         % (dammit.original_encoding))
    if lang is None:
        return dammit.unicode_markup
    if lang == 'auto':
        lang = TextSanitizer.guess_lang_from_data(
            dammit.unicode_markup, is_html=is_html)
        if verbose:
            sys.stderr.write("Detected language: %s\n" % (lang))
    return TextSanitizer._to_unicode_chared(data, lang, verbose=verbose)
def followlinks(bx):
    """Recursively crawl *bx*, printing pages and queueing links via getCF.

    Directory-style hrefs (ending in '/') are recursed into; plain links
    are fetched with getCF.  Returns the list of discovered URLs.
    NOTE(review): relies on the Python 2 print *statement* — see the
    `print('...') % sr` line below, which only works as a statement.
    """
    # Derive the parent directory URL of bx.
    p = urlparse(bx)
    if '/' not in p.path[-1:]:
        part = p.path.split('/')[-1]
        path = p.path.rstrip(part)
    else:
        path = p.path
    if '/' not in path[:1]:
        path = '/' + path
    urlfqdn = p.scheme + '://' + p.netloc
    parent = urlfqdn + path + '/'
    s = scraper.get(bx, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
    print('\n----------------------------------------------------------- \n')
    print(s)
    print('\n')
    scr = UnicodeDammit.detwingle(s.text)
    shtml = BeautifulSoup(scr, "html.parser")
    if debug == 1:
        orenc = str(shtml.original_encoding)
        print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
    print('\n----------------------------------------------------------- \n')
    sfindlinks = shtml.findAll('a')
    slen = len(sfindlinks)
    sdirs = []
    si = 0
    while si < slen:
        for slink in sfindlinks:
            if debug == 1:
                print('\n\033[34;1mSLINK LOOP\r\n\033[32;21m* si = %d, si < %d\033[0m\n' % (si, slen))
            sl = slink.get('href')
            si += 1
            if sl:
                # Skip bare self/parent links ("/", "../").
                if not re.search(r'^((\.\.)?\/)$', str(sl)):
                    if '/' in bx[-1:]:
                        # bx is a directory URL: resolve relative hrefs
                        # against it and scrape the child page.
                        if 'http' not in sl[:4]:
                            sl = sl.lstrip('/')
                            sx = bx + sl
                        else:
                            sx = sl
                        print(sx)
                        getCF(sx, 0)
                        ss = scraper.get(sx, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
                        bs = BeautifulSoup(ss.text, "html.parser")
                        if bs is not None:
                            if debug == 1:
                                orenc = str(bs.original_encoding)
                                print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
                            pagehead = bs.html.head.contents
                            pagehead = str(pagehead)
                            if pagehead:
                                # Print the child page title in a star banner.
                                pagetitle = re.search(r'<title>(.*)<\/title>', pagehead)
                                pagetitle = str(pagetitle.group(1))
                                bigtitle = pagetitle.upper()
                                titlestars = lambda a: '*' * (len(str(a)) + 4)
                                pagestars = titlestars(pagetitle)
                                print('\n\033[40m\033[33m%s\n\033[34;1m* %s * \n\033[40m\033[33;21m%s\n\033[0m' % (pagestars, bigtitle, pagestars))
                            sb = bs.find_all('a', href=re.compile(r'.+$'))
                            #sb = bs.findAll('a')
                            sblen = len(sb)
                            if sblen > 0:
                                n = 0
                                while n < sblen:
                                    for sbl in sb:
                                        if debug == 1:
                                            print('\n\033[35;1mSBL LOOP\r\n\033[37;21m* n = %d, n < %d \033[0m\n' % (n, sblen))
                                        if sbl is not None:
                                            sr = sbl.get('href').strip()
                                            sr = str(sr)
                                            # Py2 print statement: prints (string % sr).
                                            print('\n* %s \n') % sr
                                            if not re.search('http', sr[:4]):
                                                parent = getparent(sx)
                                                srs = sr.lstrip('/')
                                                sr = parent + srs
                                            if re.match(r'([^.]+\/)$', str(sr)):
                                                # Directory link: recurse.
                                                followlinks(sr)
                                                sdirs.append(sr)
                                            else:
                                                if '/' not in sr[-1:]:
                                                    getCF(sr, 0)
                                                    sdirs.append(sr)
                                            n += 1
                                        else:
                                            n += 1
                                            continue
                    elif 'Error-222' in bx:
                        # Cloudflare tarpit page: back off for 10 seconds.
                        print('\nuh-oh. might have triggered a flag with cloudflare.\n')
                        for i in xrange(10, 0, -1):
                            time.sleep(1)
                            print('delaying request for %d seconds.. \r' % i)
                            sys.stdout.flush()
                        break
                    else:
                        # bx is a file-style URL: resolve against its parent.
                        if not re.search('http', str(sl[:4])):
                            parent = getparent(bx)
                            sl = sl.lstrip('/')
                            sx = parent + sl
                        else:
                            sx = str(sl)
                        sx = str(sx)
                        sdirs.append(sx)
                        print(sx)
                        print('\n----------------------------------------------------------- \n')
                        getCF(sx, 0)
                        si += 1
                #if re.search(r'^(.*)(\/)$', str(bx)):
            else:
                print('\nno links found at %s \n' % str(slink))
                si += 1
                continue
    # Second pass: follow collected directories, fetch collected links.
    for sd in sdirs:
        if '/' in sd[-1:]:
            print('\nfollowing directory: %s \n' % sd)
            followlinks(sd)
            getCF(sd, 1)
        else:
            print('\nrequesting link: %s \n' % sd)
            getCF(sd, 0)
    return sdirs
def _sub_read(self, f):
    """
    Parameters
    ----------
    f : file buffer
        A file buffer for the ARFF file.

    Yields
    ------
    curr_id : str
        The current ID for the example.
    class_name : float or str
        The name of the class label for the example.
    example : dict
        The example valued in dictionary format, with 'x'
        as list of features.
    """
    field_names = []
    # Process ARFF header
    for line in f:
        # Process encoding
        if not isinstance(line, text_type):
            decoded_line = UnicodeDammit(
                line, ['utf-8', 'windows-1252']).unicode_markup
        else:
            decoded_line = line
        line = decoded_line.strip()
        # Skip empty lines
        if line:
            # Split the line using CSV reader because it can handle
            # quoted delimiters.
            split_header = self.split_with_quotes(line)
            row_type = split_header[0].lower()
            if row_type == '@attribute':
                # Add field name to list
                field_name = split_header[1]
                field_names.append(field_name)
                # Check if we're doing regression
                if field_name == self.label_col:
                    self.regression = (len(split_header) > 2 and
                                       split_header[2] == 'numeric')
            # Save relation if specified
            elif row_type == '@relation':
                self.relation = split_header[1]
            # Stop at data
            elif row_type == '@data':
                break
            # Skip other types of rows (relations)
    # Create header for CSV
    if PY2:
        io_type = BytesIO
    else:
        io_type = StringIO
    with io_type() as field_buffer:
        csv.writer(field_buffer, dialect='arff').writerow(field_names)
        field_str = field_buffer.getvalue()
    # Set label_col to be the name of the last field, since that's standard
    # for ARFF files
    if self.label_col != field_names[-1]:
        self.label_col = None
    # Process data as CSV file
    return super(ARFFReader, self)._sub_read(chain([field_str], f))
def _sub_read(self, f):
    """
    Parameters
    ----------
    f : file buffer
        A file buffer for an MegaM file.

    Yields
    ------
    curr_id : str
        The current ID for the example.
    class_name : float or str
        The name of the class label for the example.
    example : dict
        The example valued in dictionary format, with 'x'
        as list of features.

    Raises
    ------
    ValueError
        If there are duplicate feature names.
    """
    example_num = 0
    curr_id = 'EXAMPLE_0'
    for line in f:
        # Process encoding
        if not isinstance(line, text_type):
            line = UnicodeDammit(line, ['utf-8', 'windows-1252']).unicode_markup
        line = line.strip()
        # Handle instance lines
        if line.startswith('#'):
            curr_id = line[1:].strip()
        elif line and line not in ['TRAIN', 'TEST', 'DEV']:
            split_line = line.split()
            num_cols = len(split_line)
            del line
            # Line is just a class label
            if num_cols == 1:
                class_name = safe_float(split_line[0],
                                        replace_dict=self.class_map)
                field_pairs = []
            # Line has a class label and feature-value pairs
            elif num_cols % 2 == 1:
                class_name = safe_float(split_line[0],
                                        replace_dict=self.class_map)
                field_pairs = split_line[1:]
            # Line just has feature-value pairs
            elif num_cols % 2 == 0:
                class_name = None
                field_pairs = split_line
            curr_info_dict = {}
            if len(field_pairs) > 0:
                # Get current instances feature-value pairs
                field_names = islice(field_pairs, 0, None, 2)
                # Convert values to floats, because otherwise
                # features'll be categorical
                field_values = (safe_float(val) for val in
                                islice(field_pairs, 1, None, 2))
                # Add the feature-value pairs to dictionary
                curr_info_dict.update(zip(field_names, field_values))
                # zip() drops duplicate keys, so a length mismatch means a
                # feature name appeared twice on this line.
                if len(curr_info_dict) != len(field_pairs) / 2:
                    raise ValueError(('There are duplicate feature ' +
                                      'names in {} for example ' +
                                      '{}.').format(self.path_or_list,
                                                    curr_id))
            yield curr_id, class_name, curr_info_dict
            # Set default example ID for next instance, in case we see a
            # line without an ID.
            example_num += 1
            curr_id = 'EXAMPLE_{}'.format(example_num)
def save_subtitles(file_path, subtitles, single=False, directory=None, chmod=None, formats=("srt",),
                   tags=None, path_decoder=None, debug_mods=False):
    """Save subtitles on filesystem.

    Subtitles are written in list order; once a subtitle in some language
    has been saved, later subtitles in the same language are silently
    skipped. The extension used is `.lang.srt` by default or `.srt` is
    `single` is `True`, with `lang` being the IETF code for the
    :attr:`~subliminal.subtitle.Subtitle.language` of the subtitle.

    :param file_path: video file path
    :param formats: list of "srt" and "vtt"
    :param subtitles: subtitles to save.
    :type subtitles: list of :class:`~subliminal.subtitle.Subtitle`
    :param bool single: save a single subtitle, default is to save one
        subtitle per language.
    :param str directory: path to directory where to save the subtitles,
        default is next to the video.
    :return: the saved subtitles
    :rtype: list of :class:`~subliminal.subtitle.Subtitle`

    patch: unicode path problems
    """
    logger.debug("Subtitle formats requested: %r", formats)
    saved = []
    for sub in subtitles:
        # A subtitle without downloaded content cannot be written.
        if sub.content is None:
            logger.error('Skipping subtitle %r: no content', sub)
            continue

        # Only one subtitle per language is kept.
        if sub.language in set(s.language for s in saved):
            logger.debug('Skipping subtitle %r: language already saved', sub)
            continue

        # Build the destination path.
        target = get_subtitle_path(file_path,
                                   None if single else sub.language,
                                   forced_tag=sub.language.forced,
                                   tags=tags)
        if directory is not None:
            target = os.path.join(directory, os.path.split(target)[1])
        if path_decoder:
            target = path_decoder(target)

        # force unicode
        target = UnicodeDammit(target).unicode_markup
        sub.storage_path = target

        for format in formats:
            # Non-srt formats swap in their own extension.
            if format != "srt":
                target = os.path.splitext(target)[0] + (u".%s" % format)

            logger.debug(u"Saving %r to %r", sub, target)
            content = sub.get_modified_content(format=format, debug=debug_mods)
            if content:
                with open(target, 'w') as f:
                    f.write(content)
                sub.storage_path = target
            else:
                logger.error(u"Something went wrong when getting modified subtitle for %s", sub)

        # change chmod if requested
        if chmod:
            os.chmod(target, chmod)

        saved.append(sub)

        # A single subtitle was requested; stop after the first success.
        if single:
            break

    return saved
def replace_cid_and_change_headers(html, pk):
    """
    Replace cid-image sources and collect dummy attachment headers.

    Looks for ``<img>`` tags carrying a ``cid`` attribute in the HTML
    source. Every cid that matches an attachment linked to the email gets
    its ``src`` rewritten to ``cid:...``; the attachment bytes are read
    from storage and collected into a dummy header dict. A plain-text
    rendering of the HTML is produced as well.

    Args:
        html (string): HTML string of the email body to be sent.
        pk (int): Primary key of the email message whose attachments are
            linked.

    Returns:
        body_html (string), body_text (string), dummy_headers (dict)
    """
    if html is None:
        return None

    dummy_headers = []
    inline_images = []
    attachments = []

    soup = create_a_beautiful_soup_object(html)
    if pk:
        attachments = EmailAttachment.objects.filter(message_id=pk)
    if soup and attachments:
        inline_images = soup.findAll('img', {'cid': lambda cid: cid})

    if (not soup or soup.get_text() == '') and not inline_images:
        # Nothing to rewrite; pass the HTML through untouched.
        body_html = html
    else:
        handled_cids = []
        for img_tag in inline_images:
            img_cid = img_tag['cid']
            for attachment in attachments:
                cid_matches = (attachment.cid[1:-1] == img_cid or
                               attachment.cid == img_cid)
                if cid_matches and attachment.cid not in handled_cids:
                    img_tag['src'] = "cid:%s" % img_cid

                    storage_file = default_storage._open(attachment.attachment.name)
                    filename = get_attachment_filename_from_url(
                        attachment.attachment.name)

                    # S3-style storage exposes `.key`; otherwise guess the
                    # type from the local file name.
                    if hasattr(storage_file, 'key'):
                        content_type = storage_file.key.content_type
                    else:
                        content_type = mimetypes.guess_type(
                            storage_file.file.name)[0]

                    storage_file.open()
                    content = storage_file.read()
                    storage_file.close()

                    dummy_headers.append({
                        'content-type': content_type,
                        'content-disposition': 'inline',
                        'content-filename': filename,
                        'content-id': attachment.cid,
                        'x-attachment-id': img_cid,
                        'content-transfer-encoding': 'base64',
                        'content': content,
                    })
                    handled_cids.append(attachment.cid)
                    del img_tag['cid']
        body_html = soup.encode_contents()

    # The plain-text alternative is rendered from the original HTML.
    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.body_width = 0
    body_text = text_maker.handle(html)

    # After django 1.11 update forcing the html part of the body to be
    # unicode is needed to avoid encoding errors.
    detected_encoding = UnicodeDammit(body_html).original_encoding
    if detected_encoding:
        body_html = body_html.decode(detected_encoding)

    return body_html, body_text, dummy_headers
def parse_more(page_tree): rmore = '' if page_tree.xpath('//div[@id="s_notes"]'): page = page_tree.xpath('//div[@id="s_notes"]')[0] rows = page.xpath('.//tr[@height="45px"]') for row in rows: more = row.xpath('.//td/text()')[0].strip() if more != "": rmore = rmore + more + "\n" return rmore lf_data = [] r = rr_sess.post(searchURL, data=payload) html_ud = UnicodeDammit(r.content, is_html=True) parser = html.HTMLParser(encoding=html_ud.original_encoding) tree = html.document_fromstring(r.content, parser=parser) rows_number = int(tree.xpath('//div[@id="pg_stats"]/b[1]/text()')[0].strip()) pages_number = -(-rows_number // 20) print 'Всего объектов: %d' % (rows_number) parse_search(tree) print "Получил страницу 1 из %d" % (pages_number) if rows_number > 20: for pn in range(2, pages_number + 1): r = rr_sess.get(searchURL + ur'?online_request_search_page=' + str(pn) + ur'#Z7_01HA1A42KG4D30A3BUVH3O0000') tree = html.document_fromstring(r.content, parser=parser) parse_search(tree) print "Получил страницу %d из %d" % (pn, pages_number) with open(house_path + u"/Квартиры.csv", "wb") as f:
def get_data(site_code):
    """
    Fetch and parse the observation table for one weather station.

    Parameters
    ----------
    site_code : str
        Station identifier; upper-cased and substituted into the
        configured URL pattern.

    Returns
    -------
    list or dict or int
        ``[LastRetrieval, LastModified, data_rows]`` on success, where
        ``data_rows`` maps ``'TS:<timestamp>'`` keys to row dicts.
        Returns ``{}`` when the page cannot be fetched and ``1`` when the
        expected table structure is missing. NOTE(review): the mixed
        error-return types are preserved for caller compatibility —
        consider unifying them.
    """
    url = config.get(
        'DEFAULTS', 'weather_data_url_prefix') + '/' + site_code.upper() + config.get(
        'DEFAULTS', 'weather_data_url_file_extension')
    logger.debug('retrieval url: %s' % (url))

    # Make soup
    try:
        resp = urlopen(url)
        LastRetrieval = datetime.strptime(resp.headers['Date'],
                                          '%a, %d %b %Y %H:%M:%S %Z')
        LastModified = datetime.strptime(resp.headers['Last-Modified'],
                                         '%a, %d %b %Y %H:%M:%S %Z')
        logger.debug('web page timestamp: Last-Modified: ' +
                     resp.headers['Last-Modified'])
        contents = resp.read()
        # detwingle guards against mixed UTF-8/Windows-1252 content.
        new_contents = UnicodeDammit.detwingle(contents)
        soup = BeautifulSoup(new_contents, "html.parser")
    except URLError as e:
        logger.warn('An error occurred fetching data\n\t%s\n\t%s' % (url, e.reason))
        return {}

    # Get table (the observations live in the 4th table on the page).
    try:
        tables = soup.findAll("table")
        table = tables[3]
    except AttributeError:
        # BUG FIX: the old message applied '%' to a string with no
        # placeholders ('No tables found, exiting' % (url, e.reason)),
        # which raised TypeError instead of logging.
        logger.warn('No tables found on the page for %s, exiting' % (url,))
        return 1
    except LookupError:
        # BUG FIX: IndexError is a subclass of LookupError, so the old
        # separate `except IndexError` clause was unreachable; one
        # handler covers both.
        logger.warn('there is no index table[3] on the page for ' + url)
        return 1

    # Get rows
    try:
        rows = table.find_all('tr')
    except AttributeError:
        # BUG FIX: same no-placeholder '%' bug as above.
        logger.warn('No table rows found on the page for %s, exiting' % (url,))
        return 1

    # first two columns are created from the table
    table_columns = out_file_columns[3:len(out_file_columns)]

    # Get data
    table_data = parse_rows(rows)

    # prepare the data read from the web page
    today = datetime.now()
    month = today.month
    year = today.year
    monthedge = 0
    data_rows = {}
    for i in table_data:
        data = dict(zip(table_columns, i))
        day = data['Date']

        # this gets over month/year edges.
        if int(day) <= 2 and monthedge == 0:
            monthedge = 1

        hour, minute = data['Time'].split(':')
        my_month = -1
        # this gets over month/year edges.
        if int(day) > 2 and monthedge == 1:
            # the month is coming from 'localtime' not the webpage
            my_month = month - 1
            if my_month == 0:  # january fix
                my_month = 12
                year = year - 1
        else:
            my_month = month

        obs_datetime = datetime(year, my_month, int(day), int(hour), int(minute))
        data['site_code'] = site_code.upper()
        data['DateTime'] = obs_datetime.strftime('%Y-%m-%d %H:%M:00')
        data['TIMESTAMP'] = 'TS:' + data['DateTime']

        # these fields are stored in the database as numbers, but the web
        # pages use 'NA' for missing data. that string needs to be
        # replaced with None
        check_field_values = ['AirTemp', 'Dewpoint', 'AirPressureAltimeter']
        for field in check_field_values:
            if data[field] == 'NA':
                data[field] = None
            elif not data[field]:
                data[field] = None

        data_rows[data['TIMESTAMP']] = data

    return [LastRetrieval, LastModified, data_rows]
def decode_html(html_string):
    """
    Decode raw HTML to unicode using BeautifulSoup's UnicodeDammit.

    Parameters
    ----------
    html_string : str
        Raw HTML markup.

    Returns
    -------
    unicode
        The decoded markup.

    Raises
    ------
    ValueError
        If none of the attempted encodings could decode the input.
    """
    converted = UnicodeDammit(html_string, isHTML=True)
    if not converted.unicode:
        # BUG FIX: UnicodeDecodeError requires five positional arguments
        # (encoding, object, start, end, reason); the old two-argument
        # call raised TypeError at the raise site and never formatted the
        # message. Raise ValueError with the formatted message instead.
        raise ValueError("Failed to detect encoding, tried [%s]" %
                         ', '.join(converted.triedEncodings))
    return converted.unicode
def extract_person_profile(hxs):
    # Build a PersonProfileItem from a scrapy selector over a profile
    # page. Returns None when the name span is missing (page layout not
    # recognized). Each optional section is only recorded when exactly
    # one matching node (or a non-empty list) is found.
    personProfile = PersonProfileItem()
    ## Person name
    nameField = {}
    nameSpan = hxs.select("//span[@id='name']/span")
    if nameSpan and len(nameSpan) == 1:
        nameSpan = nameSpan[0]
        givenNameSpan = nameSpan.select("span[@class='given-name']")
        if givenNameSpan and len(givenNameSpan) == 1:
            givenNameSpan = givenNameSpan[0]
            nameField['given_name'] = givenNameSpan.select("text()").extract()[0]
        familyNameSpan = nameSpan.select("span[@class='family-name']")
        if familyNameSpan and len(familyNameSpan) == 1:
            familyNameSpan = familyNameSpan[0]
            nameField['family_name'] = familyNameSpan.select("text()").extract()[0]
        personProfile['name'] = nameField
    else:
        # No name block: nothing we can parse on this page.
        return None
    headline = hxs.select("//dl[@id='headline']")
    if headline and len(headline) == 1:
        headline = headline[0]
        ## locality
        locality = headline.select("dd/span[@class='locality']/text()").extract()
        if locality and len(locality) == 1:
            personProfile['locality'] = locality[0].strip()
        ## industry
        industry = headline.select("dd[@class='industry']/text()").extract()
        if industry and len(industry) == 1:
            personProfile['industry'] = industry[0].strip()
    ## overview
    overview = hxs.select("//dl[@id='overview']").extract()
    if overview and len(overview) == 1:
        personProfile['overview_html'] = overview[0]
        homepage = LinkedinParser.parse_homepage(overview[0])
        if homepage:
            personProfile['homepage'] = homepage
    ## summary
    summary = hxs.select("//div[@id='profile-summary']/div[@class='content']/p[contains(@class,'summary')]/text()").extract()
    if summary and len(summary) > 0:
        personProfile['summary'] = ''.join(x.strip() for x in summary)
    ## specilities
    specilities = hxs.select("//div[@id='profile-specialties']/p/text()").extract()
    if specilities and len(specilities) == 1:
        specilities = specilities[0].strip()
        personProfile['specilities'] = specilities
    ## skills
    skills = hxs.select("//ol[@id='skills-list']/li/span/a/text()").extract()
    if skills and len(skills) > 0:
        personProfile['skills'] = [x.strip() for x in skills]
    additional = hxs.select("//div[@id='profile-additional']")
    if additional and len(additional) == 1:
        additional = additional[0]
        ## interests
        interests = additional.select("div[@class='content']/dl/dd[@class='interests']/p/text()").extract()
        if interests and len(interests) == 1:
            personProfile['interests'] = interests[0].strip()
        ## groups
        g = additional.select("div[@class='content']/dl/dd[@class='pubgroups']")
        if g and len(g) == 1:
            groups = {}
            g = g[0]
            member = g.select("p/text()").extract()
            if member and len(member) > 0:
                # NOTE(review): ''.join of a single string re-joins its
                # characters (a no-op) — presumably the whole list was
                # meant to be joined; verify intent before changing.
                groups['member'] = ''.join(member[0].strip())
            gs = g.select("ul[@class='groups']/li[contains(@class,'affiliation')]/div/a/strong/text()").extract()
            if gs and len(gs) > 0:
                # NOTE(review): 'affilition' key spelling kept as-is in
                # case downstream consumers rely on it.
                groups['affilition'] = gs
            personProfile['group'] = groups
        ## honors
        honors = additional.select("div[@class='content']/dl/dd[@class='honors']/p/text()").extract()
        if honors and len(honors) > 0:
            personProfile['honors'] = [x.strip() for x in honors]
    ## education
    education = hxs.select("//div[@id='profile-education']")
    schools = []
    if education and len(education) == 1:
        education = education[0]
        school_list = education.select("div[contains(@class,'content')]//div[contains(@class,'education')]")
        if school_list and len(school_list) > 0:
            for school in school_list:
                s = {}
                name = school.select("h3[contains(@class,'org')]/text()").extract()
                if name and len(name) == 1:
                    s['name'] = name[0].strip()
                degree = school.select("h4[@class='details-education']/span[@class='degree']/text()").extract()
                if degree and len(degree) == 1:
                    s['degree'] = degree[0].strip()
                major = school.select("h4[@class='details-education']/span[@class='major']/text()").extract()
                if major and len(major) == 1:
                    s['major'] = major[0].strip()
                period = school.select("p[@class='period']")
                if period and len(period) == 1:
                    period = period[0]
                    start = period.select("abbr[@class='dtstart']/text()").extract()
                    end = period.select("abbr[@class='dtend']/text()").extract()
                    if len(start) == 1:
                        s['start'] = start[0]
                    if len(end) == 1:
                        s['end'] = end[0]
                desc = school.select("p[contains(@class,'desc')]/text()").extract()
                if len(desc) == 1:
                    s['desc'] = desc[0].strip()
                schools.append(s)
    personProfile['education'] = schools
    ## experience
    experience = hxs.select("//div[@id='profile-experience']")
    if experience and len(experience) == 1:
        es = []
        experience = experience[0]
        exps = experience.select("//div[contains(@class,'experience')]")
        if len(exps) > 0:
            for e in exps:
                je = {}
                title = e.select("div[@class='postitle']//span[@class='title']/text()").extract()
                if len(title) > 0:
                    je['title'] = title[0].strip()
                org = e.select("div[@class='postitle']//span[contains(@class,'org')]/text()").extract()
                if len(org) > 0:
                    je['org'] = org[0].strip()
                start = e.select("p[@class='period']/abbr[@class='dtstart']/text()").extract()
                if len(start) > 0:
                    je['start'] = start[0].strip()
                end = e.select("p[@class='period']/abbr[@class='dtstamp']/text()").extract()
                if len(end) > 0:
                    je['end'] = end[0].strip()
                location = e.select("p[@class='period']/abbr[@class='location']/text()").extract()
                if len(location) > 0:
                    je['location'] = location[0]
                desc = e.select("p[contains(@class,'description')]/text()").extract()
                if len(desc) > 0:
                    je['desc'] = "".join(x.strip() for x in desc)
                es.append(je)
        personProfile['experience'] = es
    ## Also view
    alsoViewProfileList = []
    divExtra = hxs.select("//div[@id='extra']")
    if divExtra and len(divExtra) == 1:
        divExtra = divExtra[0]
        divAlsoView = divExtra.select("//div[@class='leo-module mod-util browsemap']")
        if divAlsoView and len(divAlsoView) == 1:
            divAlsoView = divAlsoView[0]
            alsoViewList = divAlsoView.select("div[@class='content']/ul/li/strong/a/@href").extract()
            if alsoViewList:
                for alsoViewItem in alsoViewList:
                    # NOTE(review): `.markup` is UnicodeDammit's raw
                    # input attribute — presumably `.unicode_markup` was
                    # intended; confirm before changing.
                    alsoViewItem = UnicodeDammit(alsoViewItem).markup
                    item = HtmlParser.get_also_view_item(alsoViewItem)
                    alsoViewProfileList.append(item)
    personProfile['also_view'] = alsoViewProfileList
    return personProfile
def decode_html(html_string):
    """
    Decode markup to unicode via bs4's UnicodeDammit.

    Parameters
    ----------
    html_string : str or bytes
        Raw markup.

    Returns
    -------
    str
        The decoded markup.

    Raises
    ------
    ValueError
        If none of the attempted encodings could decode the input.
    """
    converted = UnicodeDammit(html_string)
    if not converted.unicode_markup:
        # BUG FIX: the old two-argument UnicodeDecodeError(...) call is
        # invalid (the constructor needs encoding, object, start, end,
        # reason) and raised TypeError at the raise site; raise
        # ValueError with a properly formatted message instead.
        raise ValueError("Failed to detect encoding, tried [%s]" %
                         ', '.join(converted.tried_encodings))
    # print converted.original_encoding
    return converted.unicode_markup
def on_pubmsg(self, c, e):
    # IRC channel-message handler: dispatches explicit bot commands and
    # a series of pattern-triggered replies. `c` is the connection,
    # `e` the message event.
    nick = e.source.nick
    target = e.target if is_channel(e.target) else nick

    def reply(msg):
        # Answer into the channel (or back to the sender for queries).
        self.send(target, msg)

    def dm(msg):
        # Answer the sender directly.
        self.send(nick, msg)
    # Incoming bytes may be in any encoding; let UnicodeDammit decode.
    line = UnicodeDammit(e.arguments[0]).unicode_markup
    log(' \033[37m{}→{}\033[0m'.format(nick, line))
    # "<botnick>: <command>" — hand off to the command dispatcher.
    a = line.split(":", 1)
    if len(a) > 1 and a[0].lower() == self.nick:
        self.do_command(e, a[1].strip().lower(), nick, target, reply, dm)
        return
    # zeltofilter
    if 'zeltoph' in nick:
        return
    # VIPs get kicked with a per-nick probability from settings.
    foo = settings.VIPS.get(nick, 0)
    if random() < foo:
        self.kick(nick)
    # Table-flip gets the table put back.
    match = re.match('.*┻━┻.*', line)
    if match:
        reply('┬─┬ノ(ಠ_ಠノ)')
        return
    # "chaos-☆: <note>" records a chaos note.
    match = re.match('^({} *:)? *chaos-?([☆★☼☀*]|sternchen) *: ?(.*)$'.format(self.nick), line)
    if match:
        newcs = match.group(3)
        self.chaossternchen.append(newcs)
        self.sendchan('Chaos-☆ Nr. {} notiert: {}'.format(len(self.chaossternchen), newcs))
        return
    # ".wiki <page>" links to the dokuwiki page if it exists.
    if line.startswith('.wiki '):
        wikipage = line[len('.wiki '):].strip()
        if re.match('^[-_+\w]+$', wikipage):
            wikiurl = 'http://afra-berlin.de/dokuwiki/doku.php?id={}'.format(wikipage)
            # NOTE(review): blocking HTTP call without timeout inside the
            # message handler — confirm this is acceptable for the bot.
            if 'Dieses Thema existiert noch nicht' in requests.get(wikiurl).text:
                reply("I'm sorry, I can't find a wiki page with that name.")
            else:
                reply(wikiurl)
        else:
            reply('Try to troll somebot else.')
        return
    if line == 'wat?':
        reply("I don't have a clue.")
        return
    if re.match('^hail eris[.!]* ', line.lower()):
        reply("All Hail Discordia!")
        return
    # Kick for misspellings of Gandhi.
    m = re.findall('(^|\s)?(gh?ah?nh?dh?ih?)(\s|$)?', line, re.IGNORECASE)
    for _1,match,_2 in m:
        if not re.match('(^|\s)?gandhi(\s|$)?', match, re.IGNORECASE):
            self.kick(nick, "It's spelled Gandhi")
            return
    if re.search('https?://[-a-z0-9.]*facebook.com', line.lower()):
        reply('A facebook link? srsly? Get some self-respect!')
        return
    # Rewrite pr0gramm deep links to their static form.
    match = re.search('https?://pr0gramm.com/#(newest/\*/[0-9/]*)', line.lower())
    if match:
        reply('Fixed that pr0gramm link for you: http://pr0gramm.com/static/'+match.group(1))
        return
    # Echo 'moin' back after the fifth consecutive one.
    if line == 'moin':
        self.moincount += 1
        if self.moincount == 5:
            reply('moin')
        return
    else:
        self.moincount = 0
    # "eta <text>" stores the sender's ETA in the database.
    if line.lstrip('.!#').startswith('eta '):
        eta = line[4:].strip()
        with self.db as db:
            db.execute("DELETE FROM etas WHERE nick=?", (nick,))
            if eta:
                db.execute("INSERT INTO etas VALUES (DATETIME('now'), ?, ?)", (nick, eta))
        dm('ETA registered. Thanks!')
        return
    # Fetch page titles for posted URLs.
    m = re.findall(URL_REGEX, line.lower())
    for url,*_ in m:
        # NOTE(review): soup.title can be None on pages without a
        # <title> — this would raise AttributeError; verify.
        res = requests.get(url)
        if res.status_code == requests.codes.ok:
            soup = BeautifulSoup(res.text)
            reply(soup.title.string)
    # Occasionally correct misspellings of AfRA.
    m = re.findall('(^|\s)(afra)(\s|$)', line, re.IGNORECASE)
    for _1,match,_2 in m:
        if match != 'AfRA' and match != 'afra' and random() < 0.1:
            reply("I'm sure you meant AfRA, not "+match)
            return
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another \
                     format. Formats are determined automatically from file \
                     extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .ndj, or .tsv)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .ndj, or .tsv)')
    parser.add_argument('-i', '--id_col',
                        help='Name of the column which contains the instance \
                              IDs in ARFF, CSV, or TSV files.',
                        default='id')
    label_group = parser.add_mutually_exclusive_group(required=False)
    label_group.add_argument('-l', '--label_col',
                             help='Name of the column which contains the class \
                                   labels in ARFF, CSV, or TSV files. For ARFF \
                                   files, this must be the final column to count as\
                                   the label.',
                             default='y')
    label_group.add_argument('--no_labels',
                             action='store_true',
                             default=False,
                             help='Used to indicate that the input data has no labels.')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not \
                              classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from labels and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()
    if input_extension not in EXT_TO_READER:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # BUG FIX: also validate the output extension up front; previously an
    # unsupported output extension crashed much later with a bare KeyError
    # on EXT_TO_WRITER[output_extension].
    if output_extension not in EXT_TO_WRITER:
        logger.error(('Output file must be in either .arff, .csv, '
                      '.jsonlines, .libsvm, .ndj, or .tsv format. You '
                      'specified: {}').format(output_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line, ['utf-8', 'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            # The mapping lives in the trailing comment of each line.
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(_pair_to_dict_tuple(pair) for pair in
                            feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in
                             label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    label_col = None if args.no_labels else args.label_col

    # Iterate through input file and collect the information we need
    reader = EXT_TO_READER[input_extension](args.infile,
                                            quiet=args.quiet,
                                            label_col=label_col,
                                            id_col=args.id_col)
    feature_set = reader.read()

    # write out the file in the requested output format
    writer_type = EXT_TO_WRITER[output_extension]
    writer_args = {'quiet': args.quiet}
    if writer_type is CSVWriter or writer_type is TSVWriter:
        writer_args['label_col'] = label_col
        writer_args['id_col'] = args.id_col
    elif writer_type is ARFFWriter:
        writer_args['label_col'] = label_col
        writer_args['id_col'] = args.id_col
        writer_args['regression'] = args.arff_regression
        writer_args['relation'] = args.arff_relation
    elif writer_type is LibSVMWriter:
        writer_args['label_map'] = label_map
    writer = writer_type(args.outfile, feature_set, **writer_args)
    writer.write()
def end_text():
    """Return the text of one randomly chosen quote as unicode."""
    # Pick a random quote element (index 0 .. len-1).
    idx = random.randint(1, len(all_quotes)) - 1
    node = all_quotes[idx].find("div", class_="text")
    # Strip the wrapping tag characters ([18:-6]) and normalise the
    # various <br> spellings.
    raw = str(node)[18:-6]
    raw = raw.replace("<br/>", "\n").replace("<br>", "").replace("</br>", "")
    return UnicodeDammit(raw).unicode_markup
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request

# Fetch the 7-day forecast page and print date / weather / temperature
# for each listed day.
url = "http://www.weather.com.cn/weather/101190301.shtml"
try:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=headers)
    raw_bytes = urllib.request.urlopen(request).read()
    # The site may serve UTF-8 or GBK; let UnicodeDammit pick.
    markup = UnicodeDammit(raw_bytes, ["utf-8", "gbk"]).unicode_markup
    soup = BeautifulSoup(markup, 'lxml')
    for li in soup.select("ul[class='t clearfix'] li"):
        try:
            day = li.select('h1')[0].text
            weather = li.select('p[class="wea"]')[0].text
            temp = li.select('p[class="tem"] span')[0].text + "/" + li.select(
                'p[class="tem"] i')[0].text
            print(day, weather, temp)
        except Exception as err:
            # A malformed day entry should not stop the others.
            print(err)
except Exception as err:
    print(err)
def _whois_domain(self, param):
    """Run the 'whois domain' action: validate the input domain, query
    WHOIS (optionally a second time against the server reported by the
    first response), and populate the action result."""
    config = self.get_config()
    server = config.get(phantom.APP_JSON_SERVER, None)
    domain = param[phantom.APP_JSON_DOMAIN]
    action_result = self.add_action_result(ActionResult(dict(param)))
    action_result.set_param({phantom.APP_JSON_DOMAIN: domain})
    # This sleep is required between two calls, else the server might
    # throttle the queries when done in quick succession, which leads
    # to a 'Connection reset by peer' error.
    # Sleep before doing anything (as opposed to after), so that even
    # if this action returns an error, the sleep will get executed and
    # the next call will get executed after this sleep
    time.sleep(1)
    try:
        domain = self._get_domain(domain)
    except Exception as e:
        error_message = self._get_error_message_from_exception(e)
        return action_result.set_status(phantom.APP_ERROR, WHOIS_ERR_PARSE_INPUT, error_message)
    self.debug_print("Validating/Querying Domain {0}".format(repr(domain)))
    action_result.update_summary({phantom.APP_JSON_DOMAIN: domain})
    self.save_progress("Querying...")
    # Extend pythonwhois' parser with the app-specific contact regexes.
    pythonwhois.parse.registrant_regexes.extend(REGISTRANT_REGEXES)
    pythonwhois.parse.admin_contact_regexes.extend(ADMIN_CONTACT_REGEXES)
    pythonwhois.parse.tech_contact_regexes.extend(TECH_CONTACT_REGEXES)
    pythonwhois.parse.billing_contact_regexes.extend(BILLING_CONTACT_REGEXES)
    # 1. Attempting to fetch the whois information with the server
    # if provided or without it if not provided
    whois_response = self._fetch_whois_info(action_result, domain, server)
    if whois_response is None:
        return action_result.get_status()
    # 2. Attempting to fetch the whois information with the server obtained
    # in the output response of the first step above
    if whois_response.get('contacts') and not whois_response.get('contacts').get('registrant'):
        if whois_response.get('whois_server'):
            resp_server = UnicodeDammit(whois_response.get('whois_server')[0]).unicode_markup.encode('utf-8')
            whois_response = self._fetch_whois_info(action_result, domain, resp_server)
            if whois_response is None:
                return action_result.get_status()
        else:
            self.debug_print("No second API call required as the server information could not be fetched from the first WHOIS API call")
    self.save_progress("Parsing response")
    try:
        # Need to work on the json, it contains certain fields that are not
        # parsable, so will need to go the 'fallback' way.
        # TODO: Find a better way to do this
        whois_response = json.dumps(whois_response, default=_json_fallback)
        whois_response = json.loads(whois_response)
        action_result.add_data(whois_response)
    except Exception as e:
        error_message = self._get_error_message_from_exception(e)
        return action_result.set_status(phantom.APP_ERROR, WHOIS_ERR_PARSE_REPLY, error_message)
    # Even if the query was successfull the data might not be available
    if self._response_no_data(whois_response, domain):
        return action_result.set_status(phantom.APP_SUCCESS, '{}, but, {}.'.format(WHOIS_SUCC_QUERY, WHOIS_ERR_QUERY_RETURNED_NO_CONTACTS_DATA))
    else:
        # get the registrant
        if whois_response.get('contacts') and whois_response.get('contacts').get('registrant'):
            registrant = whois_response['contacts']['registrant']
            # Summarize only the commonly-present registrant fields.
            wanted_keys = ['organization', 'name', 'city', 'country']
            summary = {x: registrant[x] for x in wanted_keys if x in registrant}
            action_result.update_summary(summary)
            action_result.set_status(phantom.APP_SUCCESS)
        else:
            action_result.set_status(phantom.APP_SUCCESS, '{}, but, {}.'.format(WHOIS_SUCC_QUERY, WHOIS_SUCC_QUERY_RETURNED_NO_REGISTRANT_DATA))
    return phantom.APP_SUCCESS
# Scrape article listings into the `datacities` MySQL table.
# NOTE(review): credentials are placeholder asterisks; `total_num`,
# `title`, `temp`, `headings`, and the imports come from earlier in the
# file (not visible here) — verify they are defined before this runs.
img=[]
links=[]
imagelinks=[]
imagewidths=[]
imageheights=[]
con=pymysql.connect(host='localhost', user='******', password='******', db='myFlaskApp', charset='utf8', cursorclass=pymysql.cursors.DictCursor)
cur=con.cursor()
# Rebuild the table from scratch on every run.
cur.execute('DROP TABLE IF EXISTS datacities')
cur.execute('CREATE TABLE datacities(id INT(11) AUTO_INCREMENT PRIMARY KEY,heading VARCHAR(1000),link VARCHAR(1000),imagelink VARCHAR(1000),imagewidth INT(11),imageheight INT(11),article VARCHAR(13000))')
# NOTE(review): range(0, total_num-1) skips the final listing, as does
# range(0, length-1) below for the last paragraph — confirm whether the
# off-by-one is intentional.
for i in range(0,total_num-1):
    # UnicodeDammit normalises the heading text to unicode.
    dammit=UnicodeDammit(title[i].get_text())
    headings.append(dammit.unicode_markup)
    links.append(title[i].find('a')['href'])
    img=temp.find_all('div',{'class':'snaps'})
    imagelinks.append(img[i].find('img')['data-lazy-src'])
    imageheights.append(img[i].find('img')['height'])
    imagewidths.append(img[i].find('img')['width'])
    # Fetch the linked article page and concatenate its paragraphs.
    reqpage=requests.get(links[i])
    reqsoup=BeautifulSoup(reqpage.content,'html.parser')
    yo=reqsoup.find('div',{'class':'articles'}).findAll('p')
    length=len(yo)
    mypara=''
    for j in range(0,length-1):
        dammit=UnicodeDammit(yo[j].get_text().encode('utf8'))
        pp=str(dammit.unicode_markup)
        mypara=str(mypara+"\n"+pp)
def maybe_convert(record, domain):
    """Converts a WARC record to JSON rows for the Page and Html tables.

    Arg:
      record: warcio.Record
      domain: string

    Returns:
      dict, JSON Page record, or None
    """
    # Only successful HTML responses are converted.
    if record.rec_type != 'response':
        return

    if (record.http_headers.get_statuscode() != '200' or
            not record.http_headers.get('Content-Type', '').startswith('text/html')):
        return

    url = record.rec_headers.get('WARC-Target-URI')
    if url in seen_urls or blacklist.URL_BLACKLIST_RE.search(url):
        return

    assert domain
    # Keep only URLs on the requested domain or its subdomains.
    url_domain = urlparse(url).netloc.split(':')[0]
    if url_domain != domain and not url_domain.endswith('.' + domain):
        return

    row = {
        'domain': url_domain,
        'url': url,
        'fetch_time': record.rec_headers.get('WARC-Date'),
        'rels': [],  # placeholders so that key order is preserved
        'u_urls': [],
        'mf2_classes': [],
        'mf2': '{}',
        'headers': [{'name': name, 'value': value}
                    for name, value in sorted(record.http_headers.headers)],
    }

    # Oversized responses are stored as a marker row without content.
    content_length = record.http_headers.get('Content-Length')
    if content_length and int(content_length) > MAX_ROW_SIZE:
        row.update({
            'html': MAX_ROW_MESSAGE,
            'mf2': json.dumps({MAX_ROW_MESSAGE: None}),
        })
        return row

    # TODO: charset from HTTP header Content-Type
    #
    # use UnicodeDammit to gracefully handle response contents with invalid
    # content for their character encoding, e.g. invalid start or continuation
    # bytes in UTF-8.
    body_bytes = record.content_stream().read()
    body = UnicodeDammit(body_bytes).unicode_markup
    if not body:
        return
    if url in seen_urls:
        return
    seen_urls.add(url)

    soup = BeautifulSoup(body, 'lxml')
    links = [
        {
            'tag': link.name,
            'url': link['href'],
            # inner HTML content
            'inner_html': ''.join(str(c) for c in link.children),
            'rels': link.get('rel', []),
            'classes': link.get('class', []),
        }
        for link in soup.find_all('link') + soup.find_all('a')
        if link.get('href')
    ]

    row.update({
        'links': links[:MAX_LINKS],
        # heuristic: check that HTML is <= 1/2 max size to avoid cost of
        # serializing this whole JSON object just to check its length.
        'html': body if len(body_bytes) <= MAX_ROW_SIZE / 2 else MAX_ROW_MESSAGE,
    })

    try:
        mf2 = mf2py.parse(url=url, doc=soup)
    except Exception as e:
        print('mf2py.parse with lxml failed on %s; switching to html5lib: %s' % (url, e))
        try:
            mf2 = mf2py.parse(url=url, doc=BeautifulSoup(body, 'html5lib'))
        except Exception as e2:
            print('mf2py.parse with html5lib failed too, giving up: %s' % e2)
            return row

    def mf2_classes(obj):
        # Recursively collect all mf2 'type' values from the parse tree.
        if isinstance(obj, (list, tuple)):
            return sum((mf2_classes(elem) for elem in obj), [])
        elif isinstance(obj, dict):
            items = obj.get('items') or obj.get('children') or []
            return obj.get('type', []) + mf2_classes(items)
        raise RuntimeError('unexpected type: %r' % obj)

    # BUG FIX: the old code guarded only json.dumps against a falsy parse
    # result (`mf2 or {}`) and then called mf2.get(...) on the raw value,
    # crashing when mf2py returned None. Normalize once up front.
    mf2 = mf2 or {}
    mf2_str = json.dumps(mf2)
    row.update({
        'rels': [{'value': val, 'urls': urls}
                 for val, urls in mf2.get('rels', {}).items()],
        'u_urls': get_urls(mf2.get('items', [])),
        'mf2_classes': sorted(set(mf2_classes(mf2))),
        'mf2': (mf2_str if len(mf2_str) <= MAX_ROW_SIZE / 2
                else json.dumps({MAX_ROW_MESSAGE: None})),
    })
    return row
async def scrape_page(page, feed_id, loop):
    """Scrape one page of the Hacker News front-page feed into the database.

    Stores a Post row for each previously-unseen post and a FeedPost row
    (rank/points/comment count for this feed snapshot) for every post, then
    schedules scrape_post() for each post's comments on the event loop.

    NOTE(review): uses blocking requests.get() inside an async def — the event
    loop is blocked during the HTTP fetch; confirm whether that is acceptable.
    """
    # Connect to database
    session = models.Session()
    print('Scrape initiated for page ' + str(page) + ' of Hacker News.')
    # Get current UTC time in seconds
    now = int(datetime.utcnow().strftime('%s'))
    # Get HTML tree from feed page
    feed_html = requests.get('https://news.ycombinator.com/news?p=' + str(page))
    feed_content = feed_html.content
    feed_soup = BeautifulSoup(feed_content, 'html.parser')
    # Get all post rows from HTML tree
    post_rows = feed_soup.find_all('tr', 'athing')
    for post_row in post_rows:
        # Get subtext row with additional post data
        subtext_row = post_row.next_sibling
        # Get post id
        post_id = post_row.get('id')
        # Check if post exists in database
        post_exists = session.query(
            models.Post.id).filter_by(id=post_id).scalar()
        # Get core post data if it is not in database already
        if not post_exists:
            # Get UTC timestamp for post's posting time by subtracting the
            # number of days/hours/minutes ago given on the webpage from the
            # current UTC timestamp
            time_unit = subtext_row.find('span', 'age').a.get_text().split()[1]
            if 'day' in time_unit:
                created = now - 86400 * int(
                    subtext_row.find('span', 'age').a.get_text().split()[0])
            elif 'hour' in time_unit:
                created = now - 3600 * int(
                    subtext_row.find('span', 'age').a.get_text().split()[0])
            else:
                created = now - 60 * int(
                    subtext_row.find('span', 'age').a.get_text().split()[0])
            # NOTE(review): time.localtime converts the UTC epoch to *local*
            # time before formatting — confirm the intended timezone.
            created = time.strftime('%Y-%m-%d %H:%M', time.localtime(created))
            # Get post's link
            link = post_row.find('a', 'storylink').get('href')
            # Get post's title
            title = post_row.find('a', 'storylink').get_text()
            # Set post's type based on title
            if 'Show HN:' in title:
                type = 'show'
            elif 'Ask HN:' in title:
                type = 'ask'
            else:
                type = 'article'
            # Get username of user who posted post or set as blank for job
            # posting
            if subtext_row.find('a', 'hnuser'):
                username = subtext_row.find('a', 'hnuser').get_text()
            else:
                username = ''
            # Get website that post is from or set as blank for ask posting
            if post_row.find('span', 'sitestr'):
                website = post_row.find('span', 'sitestr').get_text()
            else:
                website = ''
            # Add post data to database
            post = models.Post(created=created, id=post_id, link=link,
                               title=title, type=type, username=username,
                               website=website)
            session.add(post)
        # Get post's comment count if it is listed (otherwise, set to 0)
        if 'comment' in subtext_row.find_all(
                href='item?id=' + post_id)[-1].get_text():
            unicode_count = UnicodeDammit(
                subtext_row.find_all(href='item?id=' + post_id)[-1].get_text())
            comment_count = unicode_count.unicode_markup.split()[0]
        else:
            comment_count = 0
        # Get post's rank on feed page
        feed_rank = post_row.find('span', 'rank').get_text()[:-1]
        # Get post's score if it is listed (otherwise, post is job posting)
        if subtext_row.find('span', 'score'):
            point_count = subtext_row.find('span',
                                           'score').get_text().split()[0]
        else:
            point_count = 0
            type = 'job'
        # Add feed-based post data to database
        feed_post = models.FeedPost(comment_count=comment_count,
                                    feed_id=feed_id, feed_rank=feed_rank,
                                    point_count=point_count, post_id=post_id)
        session.add(feed_post)
        session.commit()
        # Create asynchronous task to scrape post page for its comments
        loop.create_task(scrape_post(post_id, feed_id, loop, None))
    return
def _import(self, message):
    """import <url> [<alias(es)>] - imports all aliases from the given address,
    or only the listed aliases"""
    # Admin-only command: alias import can define arbitrary bot commands.
    if message.User.Name not in GlobalVars.admins:
        return IRCResponse(ResponseType.Say,
                           u"Only my admins may import aliases!",
                           message.ReplyTo)

    if len(message.ParameterList) < 2:
        return IRCResponse(ResponseType.Say,
                           u"You didn't give a url to import from!",
                           message.ReplyTo)

    # Extra parameters past the URL restrict the import to those alias names.
    if len(message.ParameterList) > 2:
        onlyListed = True
        importList = [alias.lower() for alias in message.ParameterList[2:]]
    else:
        onlyListed = False

    url = message.ParameterList[1]
    try:
        page = WebUtils.fetchURL(url)
    except ValueError:
        return IRCResponse(ResponseType.Say,
                           u"'{}' is not a valid URL".format(url),
                           message.ReplyTo)
    if page is None:
        return IRCResponse(ResponseType.Say,
                           u"Failed to open page at {}".format(url),
                           message.ReplyTo)

    text = page.body
    # Normalise whatever encoding the page came back in to unicode.
    text = UnicodeDammit(text).unicode_markup
    lines = text.splitlines()
    numAliases = 0
    numHelpTexts = 0
    # Each non-blank line must be "<commandChar>alias add|help <name> ...";
    # any other line aborts the whole import with an error message.
    for lineNumber, line in enumerate(lines):
        # Skip over blank lines
        if line == u"":
            continue
        splitLine = line.split()
        if splitLine[0].lower() != u"{}alias".format(self.bot.commandChar):
            return IRCResponse(ResponseType.Say,
                               u"Line {} at {} does not begin with {}alias".format(lineNumber,
                                                                                   url,
                                                                                   self.bot.commandChar),
                               message.ReplyTo)
        subCommand = splitLine[1].lower()
        if subCommand not in [u"add", u"help"]:
            return IRCResponse(ResponseType.Say,
                               u"Line {} at {} is not an add or help command".format(lineNumber, url),
                               message.ReplyTo)

        aliasName = splitLine[2].lower()
        aliasCommand = splitLine[3:]
        aliasCommand[0] = aliasCommand[0].lower()

        # Skip over aliases that weren't listed, if any were listed
        if onlyListed and aliasName not in importList:
            continue

        if subCommand == u"add":
            # NOTE(review): aliasCommand is passed as a list here; a sibling
            # version of this method joins it with spaces first — confirm
            # which form _newAlias expects.
            self._newAlias(aliasName, aliasCommand)
            numAliases += 1
        elif subCommand == u"help":
            aliasHelp = u" ".join(splitLine[3:])
            self.aliasHelpDict[aliasName] = aliasHelp
            numHelpTexts += 1

    return IRCResponse(ResponseType.Say,
                       u"Imported {} alias(es) and {} help string(s) from {}".format(numAliases,
                                                                                     numHelpTexts,
                                                                                     url),
                       message.ReplyTo)
def wordprocessing(self, database, language, lemmatizer, news_comments,
                   news_comments_start_date, news_comments_end_date,
                   exclude_vowels, stopwords, stemmer, upload_textarea,
                   upload_option, ignore_results_amount, upload_url):
    """Run the full word-frequency pipeline over a text and return statistics.

    Seven steps (tracked via self.current_step / self.cs_name for progress
    reporting): validate params, gather the input text (textarea, URL, file,
    or Newsitem/Comment DB queries), stem/lemmatize/filter the words, build a
    frequency dict, sort it, drop the top `ignore_results_amount` words, and
    compute percentages/logarithms plus a Zipf regression line.

    Returns a {'results': {...}} dict on success, or a
    {'status': 'error', ...} dict / jsonify(...) response on invalid input.

    NOTE(review): error returns are inconsistent (plain dict vs jsonify) —
    confirm which shape the caller expects.
    """
    self.total_steps = 7
    self.current_step = 1
    self.cs_name = "Initializing"
    self.reset_amount()
    self.cs_total_amount = 0
    self.update_meta()
    #Language check
    if language not in ['english', 'dutch']:
        return {'status': 'error', 'message': "Invalid language!"}
    if database not in connections:
        return {'status': 'error', 'message': "Invalid database!"}
    self.current_step = 2
    self.cs_name = "Normalizing Input"
    self.reset_amount()
    self.cs_total_amount = 0
    self.update_meta()
    #Input normalization
    if upload_option == 'text_field':
        input_text = upload_textarea
    elif upload_option == 'url':
        page_text = requests.get(upload_url).text
        soup = BeautifulSoup(page_text, "html.parser")
        input_text = soup.text
    elif upload_option == 'file':
        # NOTE(review): `upload_file` is not a parameter of this method and is
        # not visible in this scope — this branch looks like it would raise
        # NameError; confirm where upload_file comes from.
        input_text = UnicodeDammit(upload_file.read()).unicode_markup
    elif upload_option == 'news_comments':
        start_date_text = news_comments_start_date
        end_date_text = news_comments_end_date
        # Dates arrive as 'YYYY-MM-DD' strings.
        start_date = datetime.date(
            *[int(i) for i in start_date_text.split('-')])
        end_date = datetime.date(*[int(i) for i in end_date_text.split('-')])
        filters = {
            'date__gte': start_date,
            'date__lte': end_date,
            'text__isnull': False
        }
        input_text = ""
        if news_comments in ['news', 'news_comments']:
            self.cs_name = "Normalizing Input - Reading Newsitems"
            queryset = Newsitem.objects\
                .using(database)\
                .filter(**filters)\
                .select_related('text')
            self.cs_total_amount = queryset.count()
            for newsitem in queryset:
                input_text += "\n" + newsitem.text.text
                self.increment_amount_done()
        if news_comments in ['comments', 'news_comments']:
            self.cs_name = "Normalizing Input - Reading Comments"
            queryset = Comment.objects\
                .using(database)\
                .filter(**filters)\
                .select_related('text')
            self.cs_total_amount = queryset.count()
            for comment in queryset:
                input_text += "\n" + comment.text.text
                self.increment_amount_done()
    #Stemmer selection
    if stemmer == 'no_stemmer':
        stemmer = None
    elif stemmer == 'porter':
        if language != 'english':
            return jsonify(status='error',
                           message="Invalid language for stemmer porter!")
        stemmer = PorterStemmer()
    elif stemmer == 'snowball':
        stemmer = SnowballStemmer(language)
    else:
        return jsonify(status='error', message="Invalid stemmer!")
    #Lemmatizer selection
    if lemmatizer == 'lemmatizer_off':
        lemmatizer = None
    elif language == 'english':
        lemmatizer = lemmatizer_en
    else:
        lemmatizer = lemmatizer_nl
    #Stopwords selection
    if stopwords == 'no_stopwords':
        stopwords = None
    elif stopwords == 'our_stopwords':
        stopwords = obo.stopwords
    elif stopwords == 'custom_stopwords':
        # NOTE(review): `input_json` is not defined in this scope — likely a
        # NameError on this branch; confirm its origin.
        custom_stopword_text = UnicodeDammit(
            input_json.get('custom_stopword_file').read()).unicode_markup
        stopwords = obo.stripNonAlphaNum(custom_stopword_text)
    self.current_step = 3
    self.cs_name = "Wordlist creation"
    self.reset_amount()
    self.cs_total_amount = len(input_text)
    self.update_meta()
    #Process the text
    input_text_word_count = 0
    resulting_text = ""
    final_wordlist = []
    for word_type, word in text_processor.parse_text(input_text):
        if word_type == "non-word":
            # Punctuation/whitespace passes through untouched.
            resulting_text += word
        else:
            input_text_word_count += 1
            processed_word = word
            if stemmer:
                processed_word = stemmer.stem(processed_word)
            if lemmatizer:
                processed_word = lemmatizer(processed_word)
            # Drop stopwords; optionally strip vowels from what survives.
            if not stopwords or processed_word not in stopwords:
                if exclude_vowels == 'exclude_vowels_yes':
                    if language == 'english':
                        regex = re_vowel_en
                    else:
                        regex = re_vowel_nl
                    processed_word = regex.sub("", processed_word)
                resulting_text += processed_word
                final_wordlist.append(processed_word)
        self.cs_amount_done += len(word)
        self.calculate_status_update()
    self.current_step = 4
    self.cs_name = "obo.wordListToFreqDict"
    self.reset_amount()
    self.cs_total_amount = 0
    self.update_meta()
    dictionary = obo.wordListToFreqDict(final_wordlist)
    self.current_step = 5
    self.cs_name = "obo.sortFreqDict"
    self.reset_amount()
    self.cs_total_amount = 0
    self.update_meta()
    sorteddict = obo.sortFreqDict(dictionary)
    self.current_step = 6
    self.cs_name = "Dealing with Ignored Results"
    self.reset_amount()
    self.cs_total_amount = 0
    self.update_meta()
    # Optionally drop the N most frequent words and rebuild text/wordlist
    # without them.
    ignore_results_amount = int(ignore_results_amount)
    if ignore_results_amount > 0:
        self.cs_total_amount = len(resulting_text)
        initial_index = ignore_results_amount
        ignored_words = [word for rank, word in sorteddict[:initial_index]]
        sorteddict = sorteddict[initial_index:]
        new_text = ""
        new_wordlist = []
        for word_type, word in text_processor.parse_text(resulting_text):
            if word_type == "non-word":
                new_text += word
            elif word not in ignored_words:
                new_text += word
                new_wordlist.append(word)
            self.cs_amount_done += len(word)
            self.calculate_status_update()
        resulting_text = new_text
        final_wordlist = new_wordlist
    else:
        initial_index = 0
    self.current_step = 7
    self.cs_name = "Doing the math"
    self.reset_amount()
    self.cs_total_amount = 0
    self.update_meta()
    input_text_char_count = len(input_text)
    word_count = len(final_wordlist)
    distinct_words_count = len(sorteddict)
    words = []
    frequencies = []
    word_cloud = []
    for frequency, word in sorteddict:
        words.append(word)
        frequencies.append(frequency)
        word_cloud.append([word, frequency])
    # Per-word percentage of total, plus a running cumulative percentage.
    acum_perc = Decimal(0)
    percentages = []
    acum_perc_list = []
    for freq in frequencies:
        perc = Decimal((freq * 100.0) / word_count)
        percentages.append(round(perc, 2))
        acum_perc += perc
        acum_perc_list.append(round(acum_perc, 2))
    # (log rank, log frequency) pairs for the Zipf plot.
    logarithms = []
    for i in range(len(sorteddict)):
        logarithms.append((math.log(i + 1), math.log(frequencies[i])))
    #Calculate Linear regression
    #http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.lstsq.html#numpy.linalg.lstsq
    x = numpy.array([math.log(f) for f in frequencies])
    y = numpy.array(
        [math.log(rank) for rank in range(1, distinct_words_count + 1)])
    A = numpy.vstack([x, numpy.ones(len(x))]).T
    m, c = numpy.linalg.lstsq(A, y)[0]
    #Calculate the regression line start and end,
    # and sort making the start be the one with the lower X value
    # (highcharts requires this)
    regline_start = (0, c)
    regline_end = (math.log(distinct_words_count),
                   math.log(distinct_words_count) * m + c)
    regression_line = {'start': regline_start, 'end': regline_end}
    return {
        'results': {
            'status': 'success',
            'words': words,
            'frequencies': frequencies,
            'percentages': percentages,
            'acum_perc_list': acum_perc_list,
            'logarithms': logarithms,
            'regression_line': regression_line,
            'resulting_text': resulting_text,
            'input_text_char_count': input_text_char_count,
            'input_text_word_count': input_text_word_count,
            'output_text_word_count': word_count,
            'word_cloud': word_cloud,
            'sorteddict': sorteddict
        }
    }
def process_txt(self, fileobj): return UnicodeDammit.detwingle(fileobj.read())
def _sub_read(self, f): """ Parameters ---------- f : file buffer A file buffer for an LibSVM file. Yields ------ curr_id : str The current ID for the example. class_name : float or str The name of the class label for the example. example : dict The example valued in dictionary format, with 'x' as list of features. Raises ------ ValueError If line does not look like valid libsvm format. """ for example_num, line in enumerate(f): curr_id = '' # Decode line if it's not already str if isinstance(line, bytes): line = UnicodeDammit(line, ['utf-8', 'windows-1252']).unicode_markup match = self.line_regex.search(line.strip()) if not match: raise ValueError('Line does not look like valid libsvm format' '\n{}'.format(line)) # Metadata is stored in comments if this was produced by SKLL if match.group('comments') is not None: # Store mapping from feature numbers to names if match.group('feat_map'): feat_map = {} for pair in match.group('feat_map').split(): number, name = pair.split('=') for orig, replacement in \ LibSVMReader.LIBSVM_REPLACE_DICT.items(): name = name.replace(orig, replacement) feat_map[number] = name else: feat_map = None # Store mapping from label/class numbers to names if match.group('label_map'): label_map = dict( pair.split('=') for pair in match.group('label_map').strip().split()) else: label_map = None curr_id = match.group('example_id').strip() if not curr_id: curr_id = 'EXAMPLE_{}'.format(example_num) class_num = match.group('label_num') # If we have a mapping from class numbers to labels, get label if label_map: class_name = label_map[class_num] else: class_name = class_num class_name = safe_float(class_name, replace_dict=self.class_map) curr_info_dict = dict( self._pair_to_tuple(pair, feat_map) for pair in match.group('features').strip().split()) yield curr_id, class_name, curr_info_dict
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another "
                    "format. Formats are determined automatically from file "
                    "extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, '
                             '.jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .arff, .csv, '
                             '.jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('-i', '--id_col',
                        help='Name of the column which contains the instance '
                             'IDs in ARFF, CSV, or TSV files.',
                        default='id')
    label_group = parser.add_mutually_exclusive_group(required=False)
    label_group.add_argument('-l', '--label_col',
                             help='Name of the column which contains the '
                                  'class labels in ARFF, CSV, or TSV files. '
                                  'For ARFF files, this must be the final '
                                  'column to count as the label.',
                             default='y')
    label_group.add_argument('--no_labels',
                             action='store_true',
                             default=False,
                             help='Used to indicate that the input data has '
                                  'no labels.')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not '
                             'classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use '
                             'the same mapping from labels and features to '
                             'numbers when writing libsvm files, you can '
                             'specify an existing .libsvm file to reuse the '
                             'mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension not in EXT_TO_READER:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # BUGFIX: validate the output extension too; previously an unknown output
    # extension crashed with a KeyError at EXT_TO_WRITER[output_extension]
    # instead of printing a friendly error like the input check above.
    if output_extension not in EXT_TO_WRITER:
        logger.error(('Output file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(output_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line,
                                 ['utf-8', 'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            # SKLL stores "| label_map | feat_map" metadata in the comment.
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(_pair_to_dict_tuple(pair)
                            for pair in feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair)
                             for pair in label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    label_col = None if args.no_labels else args.label_col

    # Iterate through input file and collect the information we need
    reader = EXT_TO_READER[input_extension](args.infile,
                                            quiet=args.quiet,
                                            label_col=label_col,
                                            id_col=args.id_col)
    feature_set = reader.read()

    # write out the file in the requested output format
    writer_type = EXT_TO_WRITER[output_extension]
    writer_args = {'quiet': args.quiet}
    if writer_type is CSVWriter or writer_type is TSVWriter:
        writer_args['label_col'] = label_col
        writer_args['id_col'] = args.id_col
    elif writer_type is ARFFWriter:
        writer_args['label_col'] = label_col
        writer_args['id_col'] = args.id_col
        writer_args['regression'] = args.arff_regression
        writer_args['relation'] = args.arff_relation
    elif writer_type is LibSVMWriter:
        writer_args['label_map'] = label_map
    writer = writer_type(args.outfile, feature_set, **writer_args)
    writer.write()
print() print("Here we go for some kickass movie script parsing!") print() print() print("Start by telling me when the introduction will end.") for block in script_text.descendants: # Si block est une instance de bs4.Tag, il est entouré de balises HTML # Le prochain block contiendra le même texte sans les balises # Donc on continue sans parser ce bloc if (isinstance(block, Tag)): continue # UnicodeDammit converts any string to UTF-8 # does not work so well block = UnicodeDammit(block, soup.original_encoding).unicode_markup # remove leading and ending end of lines block = block.strip('\n') # if the block doesn't have any text, skip it if (re.search('\w', block) == None): continue # bs4 ne coupe pas toujours bien les différents blocs # Mieux vaut donc redécouper par paragraphe et les traiter un à un for line in block.split('\n'): stripped_line = line.strip(' \n\t\r') if (re.search('\w', line) == None): continue print(
def extract_css(html_input, basename='sample.html', prettify_html=False):
    """Scan `html_input` and replace all styles with single link to a CSS
    file.

    Returns tuple ``<MODIFIED_HTML>, <CSS-CODE>``.

    If the `html_input` contains any ``<style>`` tags, their content is
    aggregated and returned in ``<CSS-CODE``.

    The tags are all stripped from `html` input and replaced by a link to
    a stylesheet file named ``<basename>.css``. Any extension in
    `basename` is stripped. So ``sample.html`` as `basename` will result
    in a link to ``sample.css``. The same applies for a `basename`
    ``sample.css`` or ``sample``.

    The modified HTML code is returned as first item of the result tuple.
    If `pretify_html` is True, the generated HTML code is prettified by
    BeautifulSoup. This might result in unexpected, visible gaps in
    rendered output.
    """
    # create HTML massage that removes CDATA and HTML comments in styles
    for fix, m in CDATA_MASSAGE:
        html_input = fix.sub(m, html_input)
    soup = BeautifulSoup(html_input, 'html.parser')
    # Aggregate the text of every <style> tag into one CSS blob.
    css = '\n'.join([style.text for style in soup.findAll('style')])
    if '<style>' in css:
        css = css.replace('<style>', '\n')
    # lowercase leading tag names
    css = re.sub(
        RE_CSS_TAG,
        lambda match: match.group(1).lower() + match.group(2) + '{',
        css)
    # set indent of all CSS statement lines to nil.
    css = re.sub(RE_CSS_STMT_START, lambda match: '\n' + match.group(1), css)
    # insert spaces after and before curly brackets.
    css = re.sub(RE_CURLY_OPEN, lambda match: '{ ' + match.group(1), css)
    css = re.sub(RE_CURLY_CLOSE, lambda match: match.group(1) + ' }', css)
    css_name = os.path.splitext(basename)[0] + '.css'
    # Remove empty style comments
    css = re.sub(RE_EMPTY_COMMENTS, lambda match: '', css)
    if css.startswith('\n'):
        css = css[1:]
    for num, style in enumerate(soup.findAll('style')):
        if num == 0 and css != '':
            # replace first style with link to stylesheet
            # if there are any styles contained
            new_tag = soup.new_tag(
                'link', rel='stylesheet', type='text/css', href=css_name)
            style.replace_with(new_tag)
        else:
            # All remaining <style> tags are simply removed.
            style.extract()
    if css == '':
        css = None
    if prettify_html:
        return soup.prettify(), css
    # NOTE(review): UnicodeDammit(...).markup holds the input handed to
    # UnicodeDammit; .unicode_markup is the decoded text — confirm which
    # attribute was intended here.
    return UnicodeDammit(str(soup)).markup, css
def get_file_encoding(file_path): with open(file_path, 'rb') as file: content = file.read() suggestion = UnicodeDammit(content) return suggestion.original_encoding
def normalize(s): try: u = UnicodeDammit.detwingle(s).decode("utf8") except: u = UnicodeDammit(s, ["utf8", "windows-1252"]).unicode_markup return u
def slim_html(self, raw_html): doc = UnicodeDammit.detwingle(raw_html) soup = BeautifulSoup(doc, "html5lib", from_encoding="utf-8") return soup.prettify().encode("utf-8");
def post(self, group_id):
    """Automatically registers several student accounts based on a CSV.

    Flow: authorize the caller (teacher/owner/admin of the classroom),
    parse the uploaded CSV, enforce the per-teacher batch-registration
    quota, validate each row with a marshmallow schema, create the
    accounts, and return the credentials both as JSON and as a
    base64-encoded CSV.

    Raises PicoException (404/403/400) on any failure.
    """
    group = api.group.get_group(gid=group_id)
    if not group:
        raise PicoException('Classroom not found', 404)
    curr_user = api.user.get_user()
    if (curr_user['tid'] not in (group['teachers'] + [group['owner']])
            and not curr_user['admin']):
        raise PicoException(
            'You do not have permission to batch-register students into ' +
            'this classroom.', status_code=403
        )

    # Load in student demographics from CSV
    req = batch_registration_req.parse_args(strict=True)
    students = []
    unicoded_csv = UnicodeDammit(req['csv'].read())  # Forcibly unicodify
    csv_reader = csv.DictReader(
        unicoded_csv.unicode_markup.split('\n'))
    try:
        for row in csv_reader:
            row = {k: v.strip() for k, v in row.items()}  # Trim whitespace
            students.append(row)
    except csv.Error as e:
        raise PicoException(
            f"Error reading CSV at line {csv_reader.line_num}: {e}",
            status_code=400)

    # Check whether registering these students would exceed maximum
    # batch registrations per teacher account
    config = api.config.get_settings()
    teacher_metadata = api.token.find_key({
        'uid': api.user.get_user()['uid']
    })
    if not teacher_metadata:
        existing_batch_count = 0
    else:
        existing_batch_count = teacher_metadata.get(
            "tokens", {}).get('batch_registered_students', 0)
    potential_batch_count = existing_batch_count + len(students)
    if (potential_batch_count > config['max_batch_registrations']):
        raise PicoException(
            "You have exceeded the maximum number of batch-registered " +
            "student accounts. Please contact an administrator.", 403
        )

    # Validate demographics
    def validate_current_year(s):
        try:
            n = int(s)
            if not (1 <= n <= 12):
                raise ValueError
        except ValueError:
            raise ValidationError(
                f'Grade must be between 1 and 12 (provided {s})')

    class BatchRegistrationUserSchema(Schema):
        # Convert empty strings to Nones when doing validation
        # to allow optional parent_email value for age 18+,
        # but back to '' before storing in database.
        @pre_load
        def empty_to_none(self, in_data, **kwargs):
            for k, v in in_data.items():
                if v == "":
                    in_data[k] = None
            return in_data

        @post_load
        def none_to_empty(self, in_data, **kwargs):
            for k, v in in_data.items():
                if v is None:
                    in_data[k] = ''
            return in_data

        current_year = fields.Str(
            data_key='Grade (1-12)', required=True,
            validate=validate_current_year)
        age = fields.Str(
            data_key='Age (13-17 or 18+)', required=True,
            validate=validate.OneOf(choices=['13-17', '18+']))
        gender = fields.Str(
            data_key="Gender", required=False, allow_none=True,
            validate=validate.OneOf(
                ['male', 'female', 'nb/gf', 'nl/no'],
                ['Male', 'Female', 'Non-Binary/Gender-Fluid',
                 'Not listed/Prefer not to answer'],
                error="If specified, must be one of {labels}. Please use "
                      "the corresponding code from: {choices}."
            )
        )
        parent_email = fields.Email(
            data_key='Parent Email (if under 18)', required=True,
            allow_none=True
        )

        @validates_schema
        def validate_parent_email(self, data, **kwargs):
            # Minors must supply a parent email; adults may omit it.
            if (data['age'] == '13-17'
                    and data['parent_email'] is None):
                raise ValidationError(
                    'Parent email must be specified for students under 18')

    try:
        students = BatchRegistrationUserSchema().load(
            students, many=True, unknown=RAISE)
    except ValidationError as err:
        raise PicoException(err.messages, status_code=400)

    # Batch-register accounts
    curr_teacher = api.user.get_user()
    created_accounts = api.group.batch_register(
        students, curr_teacher, group_id)

    if len(created_accounts) != len(students):
        raise PicoException(
            "An error occurred while adding student accounts. " +
            f"The first {len(created_accounts)} were created. " +
            "Please contact an administrator."
        )

    # Echo the input demographics back alongside the generated credentials.
    output = []
    for i in range(len(students)):
        output.append({
            'Grade (1-12)': students[i]['current_year'],
            'Age (13-17 or 18+)': students[i]['age'],
            'Gender': students[i]['gender'],
            'Parent Email (if under 18)': students[i]['parent_email'],
            'Username': created_accounts[i]['username'],
            'Password': created_accounts[i]['password']
        })

    buffer = io.StringIO()
    csv_writer = csv.DictWriter(buffer, [
        'Grade (1-12)',
        'Age (13-17 or 18+)',
        'Gender',
        'Parent Email (if under 18)',
        'Username',
        'Password'
    ])
    csv_writer.writeheader()
    csv_writer.writerows(output)
    output_csv_bytes = buffer.getvalue().encode('utf-8')

    return jsonify({
        'success': True,
        'accounts': created_accounts,
        'as_csv': base64.b64encode(output_csv_bytes).decode('utf-8')
    })
def _import(self, message): """import <url> [<alias(es)>] - imports all aliases from the given address, or only the listed aliases""" if len(message.ParameterList) < 2: return IRCResponse(ResponseType.Say, u"You didn't give a url to import from!", message.ReplyTo) if len(message.ParameterList) > 2: onlyListed = True importList = [alias.lower() for alias in message.ParameterList[2:]] else: onlyListed = False url = message.ParameterList[1] try: page = self.bot.moduleHandler.runActionUntilValue('fetch-url', url) except ValueError: return IRCResponse(ResponseType.Say, u"'{}' is not a valid URL".format(url), message.ReplyTo) if page is None: return IRCResponse(ResponseType.Say, u"Failed to open page at {}".format(url), message.ReplyTo) text = page.body text = UnicodeDammit(text).unicode_markup lines = text.splitlines() numAliases = 0 numHelpTexts = 0 for lineNumber, line in enumerate(lines): # Skip over blank lines if line == u"": continue splitLine = line.split() if splitLine[0].lower() != u"{}alias".format(self.bot.commandChar): return IRCResponse(ResponseType.Say, u"Line {} at {} does not begin with {}alias".format(lineNumber, url, self.bot.commandChar), message.ReplyTo) subCommand = splitLine[1].lower() if subCommand not in [u"add", u"help"]: return IRCResponse(ResponseType.Say, u"Line {} at {} is not an add or help command".format(lineNumber, url), message.ReplyTo) aliasName = splitLine[2].lower() aliasCommand = splitLine[3:] aliasCommand[0] = aliasCommand[0].lower() # Skip over aliases that weren't listed, if any were listed if onlyListed and aliasName not in importList: continue if subCommand == u"add": self._newAlias(aliasName, u" ".join(aliasCommand)) numAliases += 1 elif subCommand == u"help": aliasHelp = u" ".join(splitLine[3:]) self._setAliasHelp(aliasName, aliasHelp) numHelpTexts += 1 self._syncAliases() return IRCResponse(ResponseType.Say, u"Imported {} alias(es) and {} help string(s) from {}".format(numAliases, numHelpTexts, url), message.ReplyTo)
import requests from bs4 import BeautifulSoup from bs4 import UnicodeDammit import csv url = "http://www.billboard.com/charts/hot-100" r = requests.get(url) page_cont = r.content print(page_cont) suggestion = UnicodeDammit(page_cont) suggestion.original_encoding #suggestion.unicode_markup page_cont_par = BeautifulSoup(r.content, "html.parser") containers = page_cont_par.findAll("div", {"class": "chart-row__main-display"}) filename = "musics.csv" f = open(filename, "w") headers = "posicao;musica;artista\n" f.write(headers) container = containers[0] for container in containers: posicao = container.div.span.text.strip() musica = container.h2.text.strip() artista = container.a.text.strip() print("posicao: " + posicao) print("musica: " + musica) print("artista: " + artista) f.write(posicao + ";" + musica.replace(";", "|") + ";" + artista.replace(";", "|") + "\n") f.close()
def importFitFromFiles(paths, iportuser=None):
    """
    Imports fits from file(s). First processes all provided paths and stores
    assembled fits into a list. This allows us to call back to the GUI as
    fits are processed as well as when fits are being saved.

    Parameters
    ----------
    paths : iterable of str
        File paths to import fits from.
    iportuser : IPortUser, optional
        GUI callback target for progress notifications; no notifications are
        sent when None.

    Returns
    -------
    (bool, list or str)
        (True, fit_list) on success, (False, <error message>) on failure.
    """
    sFit = svcFit.getInstance()

    fit_list = []
    try:
        for path in paths:
            if iportuser:  # Pulse
                msg = "Processing file:\n%s" % path
                pyfalog.debug(msg)
                processing_notify(
                    iportuser,
                    IPortUser.PROCESS_IMPORT | IPortUser.ID_UPDATE, msg)
                # wx.CallAfter(callback, 1, msg)

            with open(path, "rb") as file_:
                srcString = file_.read()
                # Normalise whatever encoding the file used to unicode.
                dammit = UnicodeDammit(srcString)
                srcString = dammit.unicode_markup

            if len(srcString) == 0:  # ignore blank files
                pyfalog.debug("File is blank.")
                continue

            try:
                importType, makesNewFits, fitsImport = Port.importAuto(
                    srcString, path, iportuser=iportuser)
                fit_list += fitsImport
            except xml.parsers.expat.ExpatError:
                pyfalog.warning("Malformed XML in:\n{0}", path)
                return False, "Malformed XML in %s" % path

        # IDs = []  # NOTE: what use for IDs?
        numFits = len(fit_list)
        for idx, fit in enumerate(fit_list):
            # Set some more fit attributes and save
            fit.character = sFit.character
            fit.damagePattern = sFit.pattern
            fit.targetProfile = sFit.targetProfile
            if len(fit.implants) > 0:
                fit.implantLocation = ImplantLocation.FIT
            else:
                useCharImplants = sFit.serviceFittingOptions["useCharacterImplantsByDefault"]
                fit.implantLocation = ImplantLocation.CHARACTER if useCharImplants else ImplantLocation.FIT
            db.save(fit)
            # IDs.append(fit.ID)
            if iportuser:  # Pulse
                pyfalog.debug(
                    "Processing complete, saving fits to database: {0}/{1}",
                    idx + 1, numFits)
                processing_notify(
                    iportuser,
                    IPortUser.PROCESS_IMPORT | IPortUser.ID_UPDATE,
                    "Processing complete, saving fits to database\n(%d/%d) %s" % (idx + 1, numFits, fit.ship.name)
                )
    except UserCancelException:
        return False, "Processing has been canceled.\n"
    except (KeyboardInterrupt, SystemExit):
        # Never swallow interpreter-exit signals.
        raise
    except Exception as e:
        pyfalog.critical("Unknown exception processing: {0}", path)
        pyfalog.critical(e)
        # TypeError: not all arguments converted during string formatting
        # return False, "Unknown Error while processing {0}" % path
        return False, "Unknown error while processing {}\n\n Error: {} {}".format(
            path, type(e).__name__, getattr(e, 'message', ''))

    return True, fit_list
def store_subtitles(file):
    """Index embedded and external subtitles for *file* and persist the
    result into table_episodes.

    :param file: full path of the media file to index.
    :returns: list of [language_code, subtitles_path] pairs found; the path
        element is None for embedded subtitle tracks.

    Fixes vs previous revision: the sqlite connection is now closed (it was
    leaked — only the cursor was closed), bare ``except:`` no longer traps
    KeyboardInterrupt/SystemExit, and dead ``pass`` statements are removed.
    """
    logging.debug('BAZARR started subtitles indexing for this file: ' + file)
    actual_subtitles = []
    if os.path.exists(file):
        # notifications.write(msg='Analyzing this file for subtitles: ' + file, queue='list_subtitles')
        if settings.general.getboolean('use_embedded_subs'):
            logging.debug("BAZARR is trying to index embedded subtitles.")
            try:
                subtitle_languages = embedded_subs_reader.list_languages(file)
                for subtitle_language in subtitle_languages:
                    try:
                        if alpha2_from_alpha3(subtitle_language) is not None:
                            lang = str(alpha2_from_alpha3(subtitle_language))
                            logging.debug(
                                "BAZARR embedded subtitles detected: " + lang)
                            actual_subtitles.append([lang, None])
                    except Exception:
                        # Best effort: skip tracks with unrecognized codes.
                        logging.debug(
                            "BAZARR unable to index this unrecognized language: "
                            + subtitle_language)
            except Exception:
                logging.exception(
                    "BAZARR error when trying to analyze this %s file: %s"
                    % (os.path.splitext(file)[1], file))

        brazilian_portuguese = [".pt-br", ".pob", "pb"]
        try:
            dest_folder = get_subtitle_destination_folder()
            subliminal_patch.core.CUSTOM_PATHS = [dest_folder] if dest_folder else []
            subtitles = search_external_subtitles(
                file, languages=get_language_set(),
                only_one=settings.general.getboolean('single_language'))
        except Exception:
            logging.exception("BAZARR unable to index external subtitles.")
        else:
            for subtitle, language in subtitles.iteritems():
                subtitle_path = get_external_subtitles_path(file, subtitle)
                if str(os.path.splitext(subtitle)[0]).lower().endswith(
                        tuple(brazilian_portuguese)):
                    logging.debug("BAZARR external subtitles detected: " + "pb")
                    actual_subtitles.append(
                        [str("pb"), path_replace_reverse(subtitle_path)])
                elif str(language) != 'und':
                    logging.debug("BAZARR external subtitles detected: " +
                                  str(language))
                    actual_subtitles.append(
                        [str(language), path_replace_reverse(subtitle_path)])
                else:
                    if os.path.splitext(subtitle)[1] != ".sub":
                        logging.debug(
                            "BAZARR falling back to file content analysis to detect language."
                        )
                        with open(
                                path_replace(
                                    os.path.join(os.path.dirname(file), subtitle)),
                                'r') as f:
                            # Sample at most 100 lines to keep detection fast.
                            text = ' '.join(list(islice(f, 100)))
                            encoding = UnicodeDammit(text)
                            try:
                                # original_encoding may be None when detection
                                # fails; decode() then raises and is logged below.
                                text = text.decode(encoding.original_encoding)
                                detected_language = langdetect.detect(text)
                            except Exception:
                                logging.exception(
                                    'BAZARR Error trying to detect language for this subtitles file: '
                                    + path_replace(
                                        os.path.join(os.path.dirname(file), subtitle))
                                    + ' You should try to delete this subtitles file manually and ask Bazarr to download it again.'
                                )
                            else:
                                if len(detected_language) > 0:
                                    logging.debug(
                                        "BAZARR external subtitles detected and analysis guessed this language: "
                                        + str(detected_language))
                                    actual_subtitles.append([
                                        str(detected_language),
                                        path_replace_reverse(
                                            os.path.join(
                                                os.path.dirname(file), subtitle))
                                    ])

        conn_db = sqlite3.connect(os.path.join(args.config_dir, 'db', 'bazarr.db'),
                                  timeout=30)
        try:
            c_db = conn_db.cursor()
            logging.debug("BAZARR storing those languages to DB: " +
                          str(actual_subtitles))
            c_db.execute("UPDATE table_episodes SET subtitles = ? WHERE path = ?",
                         (str(actual_subtitles), path_replace_reverse(file)))
            conn_db.commit()
            c_db.close()
        finally:
            # Previously leaked: only the cursor was closed.
            conn_db.close()
    else:
        logging.debug(
            "BAZARR this file doesn't seems to exist or isn't accessible.")
    logging.debug('BAZARR ended subtitles indexing for this file: ' + file)
    return actual_subtitles
def soup_in(filename): return BeautifulSoup( UnicodeDammit.detwingle(open(filename).read()).decode('utf8'))
## MAIN FILE my_path = '/Users/lekha/galvanize/capstone/prelims/huskies/data/2015-05-26-Washington/' all_files = [f for f in os.listdir(my_path) if os.path.isfile(os.path.join(my_path, f))] data = {} #files = ['00006.html', '05111108.html', '120394.html', '1bettyevans.html'] #files = ['05111108.html'] files = all_files[1000:] for html_file in files: with open(os.path.join(my_path, html_file)) as f: s = str(f.readlines()) new_s = UnicodeDammit.detwingle(s) new_s = new_s.decode("utf-8") soup = BeautifulSoup(new_s, 'html.parser') summary = extractSummary(soup) names = extractName(soup) opath = '/Users/lekha/galvanize/capstone/prelims/huskies/data/2015-05-26-Washington/' ofile = os.path.join(opath, "output0.txt") #printSummaryRows(summary, opath) # printPhotoRows(photos, opath) # printSkillRows(skills, opath) # soup = BeautifulSoup(s, 'html.parser') # full_name = soup.find('span', {'class': 'full-name'}) # summary = soup.find('div', {'class':'summary'}) # if full_name:
def clean_unicode(comment_str): comment_str = comment_str.replace('\n', '').replace('\r', '').strip() comment_str = ' '.join(comment_str.split()) return UnicodeDammit(comment_str).unicode_markup
def store_subtitles(file):
    """Index embedded (.mkv only) and external subtitles for *file* and
    persist the result into table_episodes.

    :param file: full path of the media file to index.
    :returns: list of [language_code, subtitles_path] pairs found; the path
        element is None for embedded subtitle tracks.

    Fixes vs previous revision: the sqlite connection is now closed (it was
    leaked — only the cursor was closed), ``!= None`` replaced with the
    ``is not None`` identity test, and bare ``except:`` no longer traps
    KeyboardInterrupt/SystemExit.
    """
    logging.debug('BAZARR started subtitles indexing for this file: ' + file)
    actual_subtitles = []
    if os.path.exists(file):
        if os.path.splitext(file)[1] == '.mkv':
            logging.debug("BAZARR is trying to index embedded subtitles.")
            try:
                with open(file, 'rb') as f:
                    mkv = enzyme.MKV(f)
                for subtitle_track in mkv.subtitle_tracks:
                    try:
                        if alpha2_from_alpha3(subtitle_track.language) is not None:
                            lang = str(alpha2_from_alpha3(subtitle_track.language))
                            logging.debug(
                                "BAZARR embedded subtitles detected: " + lang)
                            actual_subtitles.append([lang, None])
                    except Exception:
                        # Best effort: skip tracks with unrecognized codes.
                        logging.debug(
                            "BAZARR unable to index this unrecognized language: "
                            + subtitle_track.language)
            except Exception:
                logging.exception(
                    "BAZARR error when trying to analyze this mkv file: " + file)
        else:
            logging.debug("BAZARR This file isn't an .mkv file.")

        brazilian_portuguese = [".pt-br", ".pob", "pb"]
        try:
            subtitles = core.search_external_subtitles(file)
        except Exception:
            logging.exception("BAZARR unable to index external subtitles.")
        else:
            for subtitle, language in subtitles.iteritems():
                if str(os.path.splitext(subtitle)[0]).lower().endswith(
                        tuple(brazilian_portuguese)) is True:
                    logging.debug("BAZARR external subtitles detected: " + "pb")
                    actual_subtitles.append([
                        str("pb"),
                        path_replace_reverse(
                            os.path.join(os.path.dirname(file), subtitle))
                    ])
                elif str(language) != 'und':
                    logging.debug("BAZARR external subtitles detected: " +
                                  str(language))
                    actual_subtitles.append([
                        str(language),
                        path_replace_reverse(
                            os.path.join(os.path.dirname(file), subtitle))
                    ])
                else:
                    if os.path.splitext(subtitle)[1] != ".sub":
                        logging.debug(
                            "BAZARR falling back to file content analysis to detect language."
                        )
                        with open(
                                path_replace(
                                    os.path.join(os.path.dirname(file), subtitle)),
                                'r') as f:
                            # Sample at most 100 lines to keep detection fast.
                            text = ' '.join(list(islice(f, 100)))
                            encoding = UnicodeDammit(text)
                            try:
                                # original_encoding may be None when detection
                                # fails; decode() then raises and is logged below.
                                text = text.decode(encoding.original_encoding)
                                detected_language = langdetect.detect(text)
                            except Exception:
                                logging.exception(
                                    'BAZARR Error trying to detect language for this subtitles file: '
                                    + path_replace(
                                        os.path.join(os.path.dirname(file), subtitle))
                                    + ' You should try to delete this subtitles file manually and ask Bazarr to download it again.'
                                )
                            else:
                                if len(detected_language) > 0:
                                    logging.debug(
                                        "BAZARR external subtitles detected and analysis guessed this language: "
                                        + str(detected_language))
                                    actual_subtitles.append([
                                        str(detected_language),
                                        path_replace_reverse(
                                            os.path.join(
                                                os.path.dirname(file), subtitle))
                                    ])

        conn_db = sqlite3.connect(os.path.join(config_dir, 'db/bazarr.db'),
                                  timeout=30)
        try:
            c_db = conn_db.cursor()
            logging.debug("BAZARR storing those languages to DB: " +
                          str(actual_subtitles))
            c_db.execute("UPDATE table_episodes SET subtitles = ? WHERE path = ?",
                         (str(actual_subtitles), path_replace_reverse(file)))
            conn_db.commit()
            c_db.close()
        finally:
            # Previously leaked: only the cursor was closed.
            conn_db.close()
    else:
        logging.debug(
            "BAZARR this file doesn't seems to exist or isn't accessible.")
    logging.debug('BAZARR ended subtitles indexing for this file: ' + file)
    return actual_subtitles