Example No. 1
    def learn(self, name, phrase, channel):
        name = self.aliases.resolve(name)
        if name not in self.users:
            self.users[name] = True

        if "password" in phrase:
            return
        phrase = phrase.split(" ")
        phrase = filter(lambda x: x and "http" not in x and "ftp:" not in x and x[0] != ".", phrase)
        now = datetime.datetime.utcnow()
        documents = []

        for i in range(len(phrase) + 1):
            seed = UnicodeDammit.detwingle(phrase[i-1] if i > 0 else "")
            answer = UnicodeDammit.detwingle(phrase[i] if i < len(phrase) else "")

            documents.append({
                "name": name,
                "seed": seed,
                "answer": answer,
                "added": now,
                "random": random.random()
            })

        yield self.db.insert(documents, safe=True)
Example No. 2
    def create(self, soupfragment):
        result = dict()
        field = self._getfield_info(soupfragment)
        title = ""
        result["link"] = ""
        result["answers"] = ""
        result["views"] = ""
        result["location"] = ""
        if self.urlobject is not None:
            result["location"] = self.urlobject.description()

        #result['location'] = self.webclient.get_url_desc()
        if field is not None:
            title = UnicodeDammit(field.a.contents[0]).unicode_markup
            result["link"] = field.a['href']
            fragment = self._get_answer_and_viewa_fragment(soupfragment)
            if fragment is not None:
                result["answers"] = self._get_number_from(fragment.contents[0].strip())
                result["views"] = self._get_number_from(fragment.contents[2].strip())
            else:
                print "No answer and view bloq identified in thread: ", result["link"]
                result["answers"] = -1
                result["views"] = -1

        result["title"] = title.strip()

        #result['next_url'] = _nextUrl(soupfragment)
        return result
Example No. 3
    def _sub_read(self, f):
        example_num = 0
        curr_id = 'EXAMPLE_0'
        for line in f:
            # Process encoding
            if not isinstance(line, text_type):
                line = UnicodeDammit(line, ['utf-8',
                                            'windows-1252']).unicode_markup
            line = line.strip()
            # Handle instance lines
            if line.startswith('#'):
                curr_id = line[1:].strip()
            elif line and line not in ['TRAIN', 'TEST', 'DEV']:
                split_line = line.split()
                num_cols = len(split_line)
                del line
                # Line is just a class label
                if num_cols == 1:
                    class_name = safe_float(split_line[0],
                                            replace_dict=self.class_map)
                    field_pairs = []
                # Line has a class label and feature-value pairs
                elif num_cols % 2 == 1:
                    class_name = safe_float(split_line[0],
                                            replace_dict=self.class_map)
                    field_pairs = split_line[1:]
                # Line just has feature-value pairs
                elif num_cols % 2 == 0:
                    class_name = None
                    field_pairs = split_line

                curr_info_dict = {}
                if len(field_pairs) > 0:
                    # Get the current instance's feature-value pairs
                    field_names = islice(field_pairs, 0, None, 2)
                    # Convert values to floats, because otherwise
                    # features'll be categorical
                    field_values = (safe_float(val) for val in
                                    islice(field_pairs, 1, None, 2))

                    # Add the feature-value pairs to dictionary
                    curr_info_dict.update(zip(field_names, field_values))

                    if len(curr_info_dict) != len(field_pairs) / 2:
                        raise ValueError(('There are duplicate feature ' +
                                          'names in {} for example ' +
                                          '{}.').format(self.path_or_list,
                                                        curr_id))

                yield curr_id, class_name, curr_info_dict

                # Set default example ID for next instance, in case we see a
                # line without an ID.
                example_num += 1
                curr_id = 'EXAMPLE_{}'.format(example_num)
Example No. 4
 def corpus_generator(self):
     with open(self.corpus_path, 'rb') as f:
         i = 0
         for line in f:
             line = UnicodeDammit(line.strip()).unicode_markup
             if line:
                 if self.lower:
                     line = line.lower()
                 i += 1
                 if i % 100000 == 0:
                     logging.info('Read {} nonblank lines'.format(i))
                 for tok in re.split(r'\s+', line):
                     yield tok
Example No. 5
    def _sub_read(self, f):
        for example_num, line in enumerate(f):
            curr_id = ''
            label_map = None
            feat_map = None
            # Decode line if it's not already str
            if isinstance(line, bytes):
                line = UnicodeDammit(line, ['utf-8',
                                            'windows-1252']).unicode_markup
            match = self.line_regex.search(line.strip())
            if not match:
                raise ValueError('Line does not look like valid libsvm format'
                                 '\n{}'.format(line))
            # Metadata is stored in comments if this was produced by SKLL
            if match.group('comments') is not None:
                # Store mapping from feature numbers to names
                if match.group('feat_map'):
                    feat_map = {}
                    for pair in match.group('feat_map').split():
                        number, name = pair.split('=')
                        for orig, replacement in \
                                LibSVMReader.LIBSVM_REPLACE_DICT.items():
                            name = name.replace(orig, replacement)
                        feat_map[number] = name
                else:
                    feat_map = None
                # Store mapping from label/class numbers to names
                if match.group('label_map'):
                    label_map = dict(pair.split('=') for pair in
                                     match.group('label_map').strip().split())
                else:
                    label_map = None
                curr_id = match.group('example_id').strip()

            if not curr_id:
                curr_id = 'EXAMPLE_{}'.format(example_num)

            class_num = match.group('label_num')
            # If we have a mapping from class numbers to labels, get label
            if label_map:
                class_name = label_map[class_num]
            else:
                class_name = class_num
            class_name = safe_float(class_name,
                                    replace_dict=self.class_map)

            curr_info_dict = dict(self._pair_to_tuple(pair, feat_map) for pair
                                  in match.group('features').strip().split())

            yield curr_id, class_name, curr_info_dict
Example No. 6
    def _sub_read(self, f):
        field_names = []
        # Process ARFF header
        for line in f:
            # Process encoding
            if not isinstance(line, text_type):
                decoded_line = UnicodeDammit(line,
                                             ['utf-8',
                                              'windows-1252']).unicode_markup
            else:
                decoded_line = line
            line = decoded_line.strip()
            # Skip empty lines
            if line:
                # Split the line using CSV reader because it can handle
                # quoted delimiters.
                split_header = self.split_with_quotes(line)
                row_type = split_header[0].lower()
                if row_type == '@attribute':
                    # Add field name to list
                    field_name = split_header[1]
                    field_names.append(field_name)
                    # Check if we're doing regression
                    if field_name == self.label_col:
                        self.regression = (len(split_header) > 2 and
                                           split_header[2] == 'numeric')
                # Save relation if specified
                elif row_type == '@relation':
                    self.relation = split_header[1]
                # Stop at data
                elif row_type == '@data':
                    break
                # Skip other types of rows (relations)

        # Create header for CSV
        if PY2:
            io_type = BytesIO
        else:
            io_type = StringIO
        with io_type() as field_buffer:
            csv.writer(field_buffer, dialect='arff').writerow(field_names)
            field_str = field_buffer.getvalue()

        # Set label_col to be the name of the last field, since that's standard
        # for ARFF files
        if self.label_col != field_names[-1]:
            self.label_col = None

        # Process data as CSV file
        return super(ARFFReader, self)._sub_read(chain([field_str], f))
Example No. 7
def convert_to_libsvm(lines):
    '''
    Converts a sequence of lines (e.g., a file or list of strings) in MegaM
    format to LibSVM format.

    :param lines: The sequence of lines to convert.
    :type lines: L{file} or L{list} of L{str}

    :return: A tuple of the newly formatted data, the mappings from class names
             to numbers, and the mappings from feature names to numbers.
    :rtype: 3-L{tuple} of (L{list} of L{unicode}, L{dict}, and L{dict})
    '''

    # Initialize variables
    field_num_dict = UniqueNumberDict()
    class_num_dict = UniqueNumberDict()

    result_list = []
    # Iterate through MegaM file
    for line in lines:
        line_fields = set()
        # Process encoding
        line = UnicodeDammit(line, ['utf-8', 'windows-1252']).unicode_markup.strip()

        # Ignore comments (and TEST/DEV lines)
        if not line.startswith('#') and not line == 'TEST' and not line == 'DEV':
            result_string = ''
            split_line = line.split()
            result_string += '{0}'.format(class_num_dict[split_line[0]])
            # Handle features if there are any
            if len(split_line) > 1:
                del split_line[0]
                # Loop through all feature-value pairs printing out pairs
                # separated by commas (and with feature names replaced with
                # numbers)
                for field_num, value in sorted(zip((field_num_dict[field_name] for field_name in islice(split_line, 0, None, 2)),
                                                   (float(value) if value != 'N/A' else 0.0 for value in islice(split_line, 1, None, 2)))):
                    # Check for duplicates
                    if field_num in line_fields:
                        field_name = (field_name for field_name, f_num in field_num_dict.items() if f_num == field_num).next()
                        raise AssertionError("Field {} occurs on same line twice.".format(field_name))
                    # Otherwise output non-empty features
                    elif value != 'N/A' and float(value):
                        result_string += ' {}:{}'.format(field_num, value)
                        line_fields.add(field_num)
            result_list.append(result_string)

    return result_list, class_num_dict, field_num_dict
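
A brief usage sketch for the converter above; the class labels and feature names are invented for illustration, and the exact numbers in the output depend on how UniqueNumberDict assigns them:

megam_lines = [
    "spam word_free 2 word_cash 1",   # hypothetical MegaM line: <class> <feature> <value> ...
    "ham word_meeting 1",
]
libsvm_lines, class_map, feat_map = convert_to_libsvm(megam_lines)
for converted in libsvm_lines:
    print(converted)  # e.g. "0 1:2.0 2:1.0" followed by "1 3:1.0"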
Example No. 8
    def ramble(self, name=None, seed=""):
        if name:
            name = self.aliases.resolve(name)
            if name not in self.users:
                returnValue("")

        message = []

        if seed:
            seed = UnicodeDammit.detwingle(seed)
            chunk = seed
            while chunk and len(" ".join(message)) < 300:
                message.append(chunk)
                chunk = yield self.prev(name, chunk)
            message.reverse()

        chunk = yield self.next(name, seed)
        while chunk and len(" ".join(message)) < 300:
            message.append(chunk)
            chunk = yield self.next(name, chunk)
            if not chunk and len(" ".join(message)) < 30:
                chunk = yield self.next(name, chunk)

        response = (" ".join(message)).decode("utf8")
        if seed and response == seed.decode("utf8"):
            response = yield self.ramble(name)
        returnValue(response)
Example No. 9
    def _fetch_data(self, entry_name, url):
        # url = url.decode('utf-8')
        # if url[:5] == 'http:':
        #     url = 'https' + url[4:]
        # url = url.encode('utf-8')
        original_entry_name = entry_name
        data = dict()
        try:
            with contextlib.closing(urllib2.urlopen(url.encode('utf-8'))) as page_source:
                page_content = page_source.read()
            doc = UnicodeDammit(page_content, is_html=True)
            parser = lxml.html.HTMLParser(encoding=doc.original_encoding)
            doc = lxml.html.document_fromstring(page_content, parser=parser)

            bar_name = doc.xpath('//a[contains(@class, "star_title_h3")]')
            if not bar_name:
                bar_name = doc.xpath('//a[contains(@class, "card_title_fname")]')
            if type(bar_name) is list and len(bar_name) > 0:
                entry_name = bar_name[0].text_content().strip()
            num_visits = doc.xpath('//span[contains(@class, "j_visit_num")]')
            if not num_visits:
                num_visits = doc.xpath('//span[contains(@class, "card_menNum")]')
            num_posts = doc.xpath('//span[contains(@class, "j_post_num")]')
            if not num_posts:
                num_posts = doc.xpath('//span[contains(@class, "card_infoNum")]')
            if type(num_visits) is list and len(num_visits) > 0:
                num_visits = num_visits[0].text_content()
                num_visits = cogtu_misc.get_first_number_from_text(num_visits)
            else:
                num_visits = 0
            if type(num_posts) is list and len(num_posts) > 0:
                num_posts = num_posts[0].text_content()
                num_posts = cogtu_misc.get_first_number_from_text(num_posts)
            else:
                num_posts = 0
            num_groups = doc.xpath("//a[contains(@class, 'star_nav_ico_group')]/span")
            if type(num_groups) is list and len(num_groups) > 0:
                num_groups = num_groups[0].text_content()
                num_groups = cogtu_misc.get_first_number_from_text(num_groups)
            else:
                num_groups = 0
        except urllib2.HTTPError:
            logging.info('urllib2.HTTPError. Skip.')
            return None, None
        except urllib2.URLError:
            logging.info('urllib2.URLError. Skip.')
            return None, None

        data['num_visits'] = int(num_visits)
        data['num_posts'] = int(num_posts)
        data['num_groups'] = int(num_groups)
        data['entry_name'] = entry_name
        data['original_entry_name'] = original_entry_name
        data['url'] = url
        return entry_name, data
Example No. 10
 def __init__(self,url):# logs info,warning,error,critical,debug events.
     '''
     Description: This is the class constructor and is going to get a simple url as input and parse it based on RFC1738.
     Status: In Progress.
     Usage: This is going to be used by the connection manager and the active/passive scanner to extract url variables.
     '''
     self.url = UnicodeDammit.detwingle(url, 'UTF-8')        
     self.defaultHttpsPort = 443
     self.defaultHttpPort = 80
     urlLogger.logInfo("--- Package: UrlManager - Module: UrlHandler Class: urlHandler Initiated ---")
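
A minimal instantiation sketch, assuming the enclosing class is named urlHandler (as its log message suggests) and that urlLogger has already been configured elsewhere in the module:

handler = urlHandler('https://example.com/search?q=test&page=2')
print(handler.url)               # the detwingled (UTF-8) form of the input URL
print(handler.defaultHttpsPort)  # 443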
Example No. 11
def remove_evernote_link(link, html):
    html = UnicodeDammit(html, ["utf-8"], is_html=True).unicode_markup
    link_converted = UnicodeDammit(link.WholeRegexMatch, ["utf-8"], is_html=True).unicode_markup
    sep = u'<span style="color: rgb(105, 170, 53);"> | </span>'
    sep_regex = escape_regex(sep)
    no_start_tag_regex = r"[^<]*"
    regex_replace = r"<{0}[^>]*>[^<]*{1}[^<]*</{0}>"
    # html = re.sub(regex_replace.format('li', link.WholeRegexMatch), "", html)
    # Remove link
    html = html.replace(link.WholeRegexMatch, "")
    # Remove empty li
    html = re.sub(regex_replace.format("li", no_start_tag_regex), "", html)
    # Remove dangling separator

    regex_span = regex_replace.format("span", no_start_tag_regex) + no_start_tag_regex + sep_regex
    html = re.sub(regex_span, "", html)
    # Remove double separator
    html = re.sub(sep_regex + no_start_tag_regex + sep_regex, sep_regex, html)
    return html
Example No. 12
   def selectdir(geturl):
      r = scraper.get(geturl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
      rt = UnicodeDammit.detwingle(r.text)
      html = BeautifulSoup(rt.decode('utf-8'), "html.parser")
      if debug == 1:
         orenc = str(html.original_encoding)
         print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
      findlinks = html.findAll('a')
      dirlist = []
      for link in findlinks:
         b = link.get('href')
         if not re.match(r'^((\.\.)?\/)$', str(b)):
            if re.search(r'^(.*)(\/)$', str(b)):
               dirlist.append(b)

      p = urlparse(geturl)
      part = p.path.split('/')[-1]
      path = p.path.rstrip(part)
      if '/' not in path[:1]:
         path = '/' + path
      urlfqdn = p.scheme + '://' + p.netloc
      parent = urlfqdn + path

      i = 0
      dirtotal = len(dirlist)
      if dirtotal > 0:
         print('\nFOUND %d DIRECTORIES: \n' % dirtotal)
         while i < dirtotal:
            sel = i + 1
            print(str(sel) + ' - ' + str(dirlist[i]))
            i += 1
         print('')
         lim = dirtotal + 1
         matchtop = r'^(%s)(\/)?$' % urlfqdn
         if not re.match(matchtop,geturl):
            print('0 - BACK TO PARENT DIRECTORY \n')
            startsel = '0-%d' % dirtotal
         else:
            startsel = '1-%d' % dirtotal
         selectdir = raw_input('make a selection [%s] --> ' % startsel)
         if not int(selectdir) in range(0, lim):
            selectdir = raw_input('invalid entry. please enter a selection %s --> ' % startsel)
         if selectdir == '0':
            geturl = parent
            subcont = 0
         else:
            n = int(selectdir) - 1
            usedir = dirlist[n]
            geturl = parent + usedir
            subcont = 1
      else:
         print('\nNO DIRECTORIES FOUND. using current directory.. \n')
         subcont = 0
         geturl = parent + part
      return geturl, subcont, parent
Example No. 13
	def clean_google_title(self, title):
		has_dot = False
		
		titleCleaned = UnicodeDammit(title).unicode_markup
		# clean step 1
		# BUGFIX: don't remove [xxx]. eg: "OQL[C++]: Ext...'
		titleCleaned = re.sub("(<(.*?)>)", "", titleCleaned)
		re_hasdot = re.compile("(\.\.\.|&hellip;)", re.I)
		match = re_hasdot.search(title)
		if match is not None:
			has_dot = True
			# clean step 2, here title is readable
		titleCleaned = re.sub("(&nbsp;|&#x25ba;|&hellip;)", "", titleCleaned)
		titleCleaned = re.sub("(&#.+?;|&.+?;)", "", titleCleaned)
		titleCleaned = titleCleaned.strip()
		readableTitle = titleCleaned
		# Shrink, only letters left
		titleCleaned = re.sub("\W", "", titleCleaned)
		titleCleaned = titleCleaned.lower()
		return (readableTitle, titleCleaned, has_dot)
Example No. 14
def document_generator(path, lower=False):
    '''
    Default document reader.  Takes a path to a file with one document per line,
    with tokens separated by whitespace, and yields lists of tokens per document.
    This could be replaced by any function that yields lists of tokens.
    See main() for how it is called.

    Note: this uses BeautifulSoup's UnicodeDammit to convert to unicode.
    '''
    with open(path, 'rb') as f:
        i = 0
        for line in f:
            line = UnicodeDammit(line.strip()).unicode_markup
            if line:
                if lower:
                    line = line.lower()
                i += 1
                if i % 100000 == 0:
                    logging.info('Read {} nonblank lines'.format(i))
                yield re.split(r'\s+', line)
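
A minimal usage sketch for the reader above; corpus.txt is a hypothetical one-document-per-line file:

for tokens in document_generator('corpus.txt', lower=True):
    print(len(tokens))  # each item is a list of whitespace-separated tokens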
Example No. 15
    def formatForReddit(self, feedEntry, postType, subreddit, raw):
        if 'content' in feedEntry:
          content = feedEntry['content'][0]['value']
        elif 'description' in feedEntry:
          content = feedEntry.description
        else:
          content = ''
        logging.debug(content)
        parser = EveRssHtmlParser()
        
        title = feedEntry['title']

        # some feeds like Twitter are raw so the parser hates it.
        if (raw):
          regex_of_url = '(https?:\/\/[\dA-z\.-]+\.[A-z\.]{2,6}[\/\w&=#\.\-\?]*)'
          title = re.sub(regex_of_url, '', title)
          clean_content = content.replace(' pic.twitter.com', ' http://pic.twitter.com')
          clean_content = re.sub(regex_of_url, '<a href="\\1">link</a>', clean_content)
          clean_content = UnicodeDammit.detwingle(clean_content)
          #logging.info(clean_content)
          u = UnicodeDammit(clean_content, 
                      smart_quotes_to='html', 
                      is_html = False )
          # fix twitter putting ellipses on the end
          content = u.unicode_markup.replace(unichr(8230),' ...')
          logging.debug('.....')
        
        if "tumblr.com" in content:
          # Replace with larger images (hopefully such images exist)
          content = content.replace('_500.', '_1280.')
        
        # Added the .replace because the parser does something funny to them and 
        # removes them before I can handle them
        content = content.replace('&nbsp;', ' ')
        content = content.replace('&bull;', '*').replace('&middot;','*')
        content = content.replace('&ldquo;','\'').replace('&rdquo;','\'')
        content = re.sub('( [ ]+)', ' ', content)
        parser.feed(content)
        parser.comments[0] = '%s\n\n%s' %(feedEntry['link'], parser.comments[0])
        parser.comments[-1] += self.config['signature']
        
        if 'author' in feedEntry:
          author = '~' + feedEntry['author'].replace('@', ' at ')
        else:
          author = ''

        return {'comments': parser.comments,
                'link':     feedEntry['link'],
                'subreddit': subreddit,
                'title':    '[%s] %s %s' %(postType, title, author)}
Example No. 16
def getContent(soup, source=''):
    newContent = []
    # Cleaning phase
    genericCleaning(soup)
    sourceSpecificcleaning(soup, source)

    # f = open("content.html", 'w'); f.write(soup.prettify().encode('utf-8')); f.close();
    # Finding content in the tree
    bestElem = None; bestText = '';
    for el in soup.findAll(True):
        score = 0.0;  hasTitle = False
        if el.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7'] and el.parent.name == '[document]':
            score += 3
        for c in el:
            if c.name == 'br': # business insider style
                score += 0.5
            if c.name == 'p':
                score += 1.0
            if not hasTitle and c.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7']:
                score += 1.0
                hasTitle = True
        if score >= 3.0: # at least 3 paragraphs
            textOutput = getText(el)
            if float(len(textOutput))/score > 20.0: # we need at least 20 characters per container
                newContent.append(textOutput)
        elif score >= 1.0:
            if bestElem is None:
                bestElem = el; bestText = getText(el, False)
            else:
                a = getText(el, False)
                if bestElem is None or len(a) > len(bestText):
                    bestElem = el; bestText = a
    if len(newContent) == 0 and bestElem is not None: # in case nothing had a score of 3, but something had a score of 1 or more
        newContent.append(bestText)

    finalText = UnicodeDammit(u'\n'.join(newContent), smart_quotes_to='ascii').unicode_markup
    return finalText.replace('\n\n', '\n')
Example No. 17
def normalize(s):
    if isinstance(s, unicode):
        return s

    try:
        u = s.decode("utf8")
    except:
        try:
            u = (s[:-1]).decode("utf8")
        except:
            try:
                u = UnicodeDammit.detwingle(s).decode("utf8")
            except:
                u = UnicodeDammit(s, ["utf8", "windows-1252"]).unicode_markup

    return u
Example No. 18
 def getpage(cfurl):      
    r = scraper.get(cfurl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
    if 'text' in r.headers.get('Content-Type'):
       rt = UnicodeDammit.detwingle(r.text)
       html = BeautifulSoup(rt.decode('utf-8'), "html.parser")
       print('\r\n--------------------------------------------------------\r\n')
       if debug == 1:
          orenc = str(html.original_encoding)
          print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
       bs = html.prettify(formatter=None)
       print(bs)
       print('\r\n--------------------------------------------------------\r\n')
    else:
       found = -1
    
    if debug == 1:
       print('\n\033[34mDEBUG: finished list length: \033[37;1m%d \033[0m\n' % len(finished))
Example No. 19
    def format(self, script):
        dammit = UnicodeDammit.detwingle(script)
        soup = BeautifulSoup(dammit, from_encoding="utf8")
        header = soup.find('subtitle_script')
        header = "[Script Info]\nTitle: "+header['title']+"\nScriptType: v4.00+\nWrapStyle: "+header['wrap_style']+"\nPlayResX: 624\nPlayResY: 366\nScaledBorderAndShadow: yes\nYCbCr Matrix: TV.709\n\n";
        styles = "[V4+ Styles]\nFormat: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n";
        events = "\n[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n";
        stylelist = soup.findAll('style')
        eventlist = soup.findAll('event')
        
        for style in stylelist:
            styles += "Style: " + style['name'] + "," + style['font_name'] + "," + style['font_size'] + "," + style['primary_colour'] + "," + style['secondary_colour'] + "," + style['outline_colour'] + "," + style['back_colour'] + "," + style['bold'] + "," + style['italic'] + "," + style['underline'] + "," + style['strikeout'] + "," + style['scale_x'] + "," + style['scale_y'] + "," + style['spacing'] + "," + style['angle'] + "," + style['border_style'] + "," + style['outline'] + "," + style['shadow'] + "," + style['alignment'] + "," + style['margin_l'] + "," + style['margin_r'] + "," + style['margin_v'] + "," + style['encoding'] + "\n"

        for event in eventlist:
            events += "Dialogue: 0,"+event['start']+","+event['end']+","+event['style']+","+event['name']+","+event['margin_l']+","+event['margin_r']+","+event['margin_v']+","+event['effect']+","+event['text']+"\n"

        formattedSubs = header+styles+events
        return formattedSubs
Example No. 20
def to_unicode(data, is_html=False, detwingle=False, verbose=True,
               lang=None):
    " converts everything to unicode"
    dammit = UnicodeDammit(data, is_html=is_html)
    if detwingle and dammit.original_encoding == 'windows-1252':
        new_data = UnicodeDammit.detwingle(data)
        dammit = UnicodeDammit(new_data, is_html=is_html)

    if verbose:
        sys.stderr.write("Original encoding (via BS): %s\n" %
                         (dammit.original_encoding))

    if lang is None:
        return dammit.unicode_markup

    if lang == 'auto':
        lang = _guess_lang_from_data(dammit.unicode_markup, is_html=is_html)
        if verbose:
            sys.stderr.write("Detected language: %s\n" % (lang))

    return _to_unicode_chared(data, lang, verbose=verbose)
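
A small usage sketch for the helper above; the byte string deliberately mixes UTF-8 and Windows-1252 content, which is the case detwingle=True is meant to repair (the detected encoding can vary with whether chardet is installed):

raw = u"\N{SNOWMAN}".encode("utf8") + u"\N{LEFT DOUBLE QUOTATION MARK}Hi!".encode("windows-1252")
text = to_unicode(raw, is_html=False, detwingle=True, verbose=False)
print(repr(text))  # unicode with both pieces decoded, if windows-1252 was detected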
Example No. 21
def unicode_dammit_example():
    # Install the 'chardet' or 'cchardet' Python libraries for better guesses

    ### Take a string with unknown encoding and make the string Unicode
    weirdass_string = "Sacr\xc3\xa9 bleu!"
    dammit = UnicodeDammit(weirdass_string)
    print "Original Word with weird encoding:", weirdass_string
    print "Dammit Print:", (dammit.unicode_markup)
    print "Dammit Type:", (dammit.original_encoding)

    ### Take a doc with mostly UTF-8 encoding (and misc encodings due to mult
    # data sources) and convert to UTF-8 Unicode with .Dammit.detwingle()
    snowmen = (u"\N{SNOWMAN}" * 3)
    quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}")
    doc = snowmen.encode("utf8") + quote.encode("windows-1252")
    # So now we have one doc with two encodings in it, printing is a mess
    #print "Weird Decoding doc with utf8:", doc # messed up, won't print
    #print (doc.decode("windows-1252")) # So messed up it doesn't even print

    # Decode using UnicodeDammit.detwingle() converts the string to pure UTF-8
    new_doc = UnicodeDammit.detwingle(doc)
    print new_doc.decode("utf8")
Example No. 22
 def getlinks(cfurl):
    r = scraper.get(cfurl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
    rt = UnicodeDammit.detwingle(r.text)
    html = BeautifulSoup(rt.decode('utf-8'), "html.parser")
    if debug == 1:
       orenc = str(html.original_encoding)
       print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
    bs = html.prettify(formatter=None)
    linkresult = html.findAll('a')
    if len(linkresult) > 0:
       foundlinks = len(linkresult)
       print('\nFOUND %s LINKS AT %s:\n' % (str(foundlinks), cfurl))
       for link in linkresult:
          b = link.get('href')
          b = str(b)
          if b not in cfurl and not re.match(r'^(\.\.)?\/$', b):
             print(b)
       print('')
    else:
       print('\nNO LINKS FOUND.\n')
       foundlinks = 0
    time.sleep(4)
    return foundlinks
Example No. 23
    def to_unicode(data, is_html=False, detwingle=False, verbose=False,
                   lang=None):
        """ Produce unicode from text of unknown encoding.
        Input: bytestring """
        dammit = UnicodeDammit(data, is_html=is_html)
        if detwingle and dammit.original_encoding == 'windows-1252':
            new_data = UnicodeDammit.detwingle(data)
            dammit = UnicodeDammit(new_data, is_html=is_html)

        if verbose:
            sys.stderr.write("Original encoding (via BS): %s\n" %
                             (dammit.original_encoding))

        if lang is None:
            return dammit.unicode_markup

        if lang == 'auto':
            lang = TextSanitizer.guess_lang_from_data(
                dammit.unicode_markup, is_html=is_html)
            if verbose:
                sys.stderr.write("Detected language: %s\n" % (lang))

        return TextSanitizer._to_unicode_chared(data, lang, verbose=verbose)
Example No. 24
   def followlinks(bx):
      p = urlparse(bx)
      if '/' not in p.path[-1:]:
         part = p.path.split('/')[-1]
         path = p.path.rstrip(part)
      else:
         path = p.path
      if '/' not in path[:1]:
         path = '/' + path
      urlfqdn = p.scheme + '://' + p.netloc
      parent = urlfqdn + path + '/'
      s = scraper.get(bx, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
      print('\n----------------------------------------------------------- \n')
      print(s)
      print('\n')
      scr = UnicodeDammit.detwingle(s.text)
      shtml = BeautifulSoup(scr, "html.parser")
      if debug == 1:
         orenc = str(shtml.original_encoding)
         print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
      print('\n----------------------------------------------------------- \n')
      sfindlinks = shtml.findAll('a')
      slen = len(sfindlinks)
      sdirs = []
      si = 0
      while si < slen:
         for slink in sfindlinks:
            if debug == 1:
               print('\n\033[34;1mSLINK LOOP\r\n\033[32;21m* si = %d, si < %d\033[0m\n' % (si, slen))
            sl = slink.get('href')
            si += 1
            if sl:
               if not re.search(r'^((\.\.)?\/)$', str(sl)):
                  if '/' in bx[-1:]:
                     if 'http' not in sl[:4]:
                        sl = sl.lstrip('/')
                        sx = bx + sl
                     else:
                        sx = sl
                     print(sx)
                     getCF(sx, 0)
                     ss = scraper.get(sx, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
                     bs = BeautifulSoup(ss.text, "html.parser")
                     if bs is not None:                        
                        if debug == 1:
                           orenc = str(bs.original_encoding)
                           print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
                        pagehead = bs.html.head.contents
                        pagehead = str(pagehead)
                        if pagehead:
                           pagetitle = re.search(r'<title>(.*)<\/title>', pagehead)
                           pagetitle = str(pagetitle.group(1))
                           bigtitle = pagetitle.upper()
                           titlestars = lambda a: '*' * (len(str(a)) + 4)
                           pagestars = titlestars(pagetitle)
                           print('\n\033[40m\033[33m%s\n\033[34;1m* %s * \n\033[40m\033[33;21m%s\n\033[0m' % (pagestars, bigtitle, pagestars)) 
                     sb = bs.find_all('a', href = re.compile(r'.+$'))
                     #sb = bs.findAll('a')
                     sblen = len(sb)
                     if sblen > 0:
                        n = 0
                        while n < sblen:
                           for sbl in sb:
                              if debug == 1:
                                 print('\n\033[35;1mSBL LOOP\r\n\033[37;21m* n = %d, n < %d \033[0m\n' % (n, sblen))
                              if sbl is not None:
                                 sr = sbl.get('href').strip()
                                 sr = str(sr)
                                 print('\n* %s \n') % sr
                                 if not re.search('http', sr[:4]):
                                    parent = getparent(sx)
                                    srs = sr.lstrip('/')
                                    sr = parent + srs
                                 if re.match(r'([^.]+\/)$', str(sr)):
                                    followlinks(sr)
                                    sdirs.append(sr)
                                 else:
                                    if '/' not in sr[-1:]:
                                       getCF(sr, 0)
                                       sdirs.append(sr)
                                 n += 1
                              else:
                                 n += 1
                                 continue

                  elif 'Error-222' in bx:
                     print('\nuh-oh. might have triggered a flag with cloudflare.\n')
                     for i in xrange(10,0,-1):
                        time.sleep(1)        
                        print('delaying request for %d seconds.. \r' % i)
                        sys.stdout.flush()
                     break
                  else:
                     if not re.search('http', str(sl[:4])):
                        parent = getparent(bx)
                        sl = sl.lstrip('/')
                        sx = parent + sl
                     else:
                        sx = str(sl)

                  sx = str(sx)
                  sdirs.append(sx)
                  print(sx)
                  print('\n----------------------------------------------------------- \n')              
                  getCF(sx, 0)
               si += 1

               #if re.search(r'^(.*)(\/)$', str(bx)):
            else:
               print('\nno links found at %s \n' % str(slink))
               si += 1
               continue

      for sd in sdirs:
         if '/' in sd[-1:]:
            print('\nfollowing directory: %s \n' % sd)
            followlinks(sd)
            getCF(sd, 1)
         else:
            print('\nrequesting link: %s \n' % sd)
            getCF(sd, 0)
      return sdirs
Example No. 25
    def _sub_read(self, f):
        """
        Parameters
        ----------
        f : file buffer
            A file buffer for the ARFF file.

        Yields
        ------
        curr_id : str
            The current ID for the example.
        class_name : float or str
            The name of the class label for the example.
        example : dict
            The example valued in dictionary format, with 'x'
            as list of features.
        """
        field_names = []
        # Process ARFF header
        for line in f:
            # Process encoding
            if not isinstance(line, text_type):
                decoded_line = UnicodeDammit(
                    line, ['utf-8', 'windows-1252']).unicode_markup
            else:
                decoded_line = line
            line = decoded_line.strip()
            # Skip empty lines
            if line:
                # Split the line using CSV reader because it can handle
                # quoted delimiters.
                split_header = self.split_with_quotes(line)
                row_type = split_header[0].lower()
                if row_type == '@attribute':
                    # Add field name to list
                    field_name = split_header[1]
                    field_names.append(field_name)
                    # Check if we're doing regression
                    if field_name == self.label_col:
                        self.regression = (len(split_header) > 2
                                           and split_header[2] == 'numeric')
                # Save relation if specified
                elif row_type == '@relation':
                    self.relation = split_header[1]
                # Stop at data
                elif row_type == '@data':
                    break
                # Skip other types of rows (relations)

        # Create header for CSV
        if PY2:
            io_type = BytesIO
        else:
            io_type = StringIO
        with io_type() as field_buffer:
            csv.writer(field_buffer, dialect='arff').writerow(field_names)
            field_str = field_buffer.getvalue()

        # Set label_col to be the name of the last field, since that's standard
        # for ARFF files
        if self.label_col != field_names[-1]:
            self.label_col = None

        # Process data as CSV file
        return super(ARFFReader, self)._sub_read(chain([field_str], f))
Example No. 26
    def _sub_read(self, f):
        """
        Parameters
        ----------
        f : file buffer
            A file buffer for an MegaM file.

        Yields
        ------
        curr_id : str
            The current ID for the example.
        class_name : float or str
            The name of the class label for the example.
        example : dict
            The example valued in dictionary format, with 'x'
            as list of features.

        Raises
        ------
        ValueError
            If there are duplicate feature names.
        """
        example_num = 0
        curr_id = 'EXAMPLE_0'
        for line in f:
            # Process encoding
            if not isinstance(line, text_type):
                line = UnicodeDammit(line,
                                     ['utf-8', 'windows-1252']).unicode_markup
            line = line.strip()
            # Handle instance lines
            if line.startswith('#'):
                curr_id = line[1:].strip()
            elif line and line not in ['TRAIN', 'TEST', 'DEV']:
                split_line = line.split()
                num_cols = len(split_line)
                del line
                # Line is just a class label
                if num_cols == 1:
                    class_name = safe_float(split_line[0],
                                            replace_dict=self.class_map)
                    field_pairs = []
                # Line has a class label and feature-value pairs
                elif num_cols % 2 == 1:
                    class_name = safe_float(split_line[0],
                                            replace_dict=self.class_map)
                    field_pairs = split_line[1:]
                # Line just has feature-value pairs
                elif num_cols % 2 == 0:
                    class_name = None
                    field_pairs = split_line

                curr_info_dict = {}
                if len(field_pairs) > 0:
                    # Get the current instance's feature-value pairs
                    field_names = islice(field_pairs, 0, None, 2)
                    # Convert values to floats, because otherwise
                    # features'll be categorical
                    field_values = (safe_float(val)
                                    for val in islice(field_pairs, 1, None, 2))

                    # Add the feature-value pairs to dictionary
                    curr_info_dict.update(zip(field_names, field_values))

                    if len(curr_info_dict) != len(field_pairs) / 2:
                        raise ValueError(
                            ('There are duplicate feature ' +
                             'names in {} for example ' + '{}.').format(
                                 self.path_or_list, curr_id))

                yield curr_id, class_name, curr_info_dict

                # Set default example ID for next instance, in case we see a
                # line without an ID.
                example_num += 1
                curr_id = 'EXAMPLE_{}'.format(example_num)
Example No. 27
def save_subtitles(file_path, subtitles, single=False, directory=None, chmod=None, formats=("srt",),
                   tags=None, path_decoder=None, debug_mods=False):
    """Save subtitles on filesystem.

    Subtitles are saved in the order of the list. If a subtitle with a language has already been saved, other subtitles
    with the same language are silently ignored.

    The extension used is `.lang.srt` by default or `.srt` if `single` is `True`, with `lang` being the IETF code for
    the :attr:`~subliminal.subtitle.Subtitle.language` of the subtitle.

    :param file_path: video file path
    :param formats: list of "srt" and "vtt"
    :param subtitles: subtitles to save.
    :type subtitles: list of :class:`~subliminal.subtitle.Subtitle`
    :param bool single: save a single subtitle, default is to save one subtitle per language.
    :param str directory: path to directory where to save the subtitles, default is next to the video.
    :return: the saved subtitles
    :rtype: list of :class:`~subliminal.subtitle.Subtitle`

    patch: unicode path problems
    """

    logger.debug("Subtitle formats requested: %r", formats)

    saved_subtitles = []
    for subtitle in subtitles:
        # check content
        if subtitle.content is None:
            logger.error('Skipping subtitle %r: no content', subtitle)
            continue

        # check language
        if subtitle.language in set(s.language for s in saved_subtitles):
            logger.debug('Skipping subtitle %r: language already saved', subtitle)
            continue

        # create subtitle path
        subtitle_path = get_subtitle_path(file_path, None if single else subtitle.language,
                                          forced_tag=subtitle.language.forced, tags=tags)
        if directory is not None:
            subtitle_path = os.path.join(directory, os.path.split(subtitle_path)[1])

        if path_decoder:
            subtitle_path = path_decoder(subtitle_path)

        # force unicode
        subtitle_path = UnicodeDammit(subtitle_path).unicode_markup

        subtitle.storage_path = subtitle_path

        for format in formats:
            if format != "srt":
                subtitle_path = os.path.splitext(subtitle_path)[0] + (u".%s" % format)

            logger.debug(u"Saving %r to %r", subtitle, subtitle_path)
            content = subtitle.get_modified_content(format=format, debug=debug_mods)
            if content:
                with open(subtitle_path, 'w') as f:
                    f.write(content)
                subtitle.storage_path = subtitle_path
            else:
                logger.error(u"Something went wrong when getting modified subtitle for %s", subtitle)

        # change chmod if requested
        if chmod:
            os.chmod(subtitle_path, chmod)

        saved_subtitles.append(subtitle)

        # check single
        if single:
            break

    return saved_subtitles
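
A hedged call sketch; video_path and subs are placeholders for a video file path and a list of downloaded subtitle objects produced by the surrounding subliminal-based code:

saved = save_subtitles(video_path, subs, directory='/tmp/subs', formats=("srt",))
for sub in saved:
    logger.debug(u"Saved %s subtitle to %s", sub.language, sub.storage_path)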
Example No. 28
def replace_cid_and_change_headers(html, pk):
    """
    Check the html source for image tags with a cid attribute. Loop through the
    attachments linked to the email and, when there is a match, replace the image
    source with the cid information. Then read the image data from disk and put
    it in a dummy header. Finally, create a plain-text version of the html email.

    Args:
        html (string): HTML string of the email body to be sent.
        pk (int): Primary key of the email message whose attachments are linked.

    Returns:
        body_html (string),
        body_text (string),
        dummy_headers (list of dict)
    """
    if html is None:
        return None

    dummy_headers = []
    inline_images = []
    soup = create_a_beautiful_soup_object(html)

    attachments = []
    if pk:
        attachments = EmailAttachment.objects.filter(message_id=pk)

    if soup and attachments:
        inline_images = soup.findAll('img', {'cid': lambda cid: cid})

    if (not soup or soup.get_text() == '') and not inline_images:
        body_html = html
    else:
        cid_done = []

        for image in inline_images:
            image_cid = image['cid']

            for file in attachments:
                if (file.cid[1:-1] == image_cid
                        or file.cid == image_cid) and file.cid not in cid_done:
                    image['src'] = "cid:%s" % image_cid

                    storage_file = default_storage._open(file.attachment.name)
                    filename = get_attachment_filename_from_url(
                        file.attachment.name)

                    if hasattr(storage_file, 'key'):
                        content_type = storage_file.key.content_type
                    else:
                        content_type = mimetypes.guess_type(
                            storage_file.file.name)[0]

                    storage_file.open()
                    content = storage_file.read()
                    storage_file.close()

                    response = {
                        'content-type': content_type,
                        'content-disposition': 'inline',
                        'content-filename': filename,
                        'content-id': file.cid,
                        'x-attachment-id': image_cid,
                        'content-transfer-encoding': 'base64',
                        'content': content
                    }

                    dummy_headers.append(response)
                    cid_done.append(file.cid)
                    del image['cid']

        body_html = soup.encode_contents()

    body_text_handler = html2text.HTML2Text()
    body_text_handler.ignore_links = True
    body_text_handler.body_width = 0
    body_text = body_text_handler.handle(html)

    # After django 1.11 update forcing the html part of the body to be unicode is needed to avoid encoding errors.
    dammit = UnicodeDammit(body_html)
    encoding = dammit.original_encoding
    if encoding:
        body_html = body_html.decode(encoding)

    return body_html, body_text, dummy_headers
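
A hedged call sketch; the HTML snippet and primary key are illustrative only, and the function still depends on the EmailAttachment model and default_storage used above:

html = u'<p>Quarterly numbers attached.</p><img cid="chart01">'
body_html, body_text, dummy_headers = replace_cid_and_change_headers(html, pk=42)
# body_html has matching images rewritten to "cid:..." sources, body_text is the
# html2text rendering, and dummy_headers is a list of inline-image header dicts.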
Example No. 29
def parse_more(page_tree):
    rmore = ''
    if page_tree.xpath('//div[@id="s_notes"]'):
        page = page_tree.xpath('//div[@id="s_notes"]')[0]
        rows = page.xpath('.//tr[@height="45px"]')
        for row in rows:
            more = row.xpath('.//td/text()')[0].strip()
            if more != "":
                rmore = rmore + more + "\n"
    return rmore


lf_data = []
r = rr_sess.post(searchURL, data=payload)
html_ud = UnicodeDammit(r.content, is_html=True)
parser = html.HTMLParser(encoding=html_ud.original_encoding)
tree = html.document_fromstring(r.content, parser=parser)
rows_number = int(tree.xpath('//div[@id="pg_stats"]/b[1]/text()')[0].strip())
pages_number = -(-rows_number // 20)
print 'Всего объектов: %d' % (rows_number)  # "Total objects: %d"
parse_search(tree)
print "Получил страницу 1 из %d" % (pages_number)
if rows_number > 20:
    for pn in range(2, pages_number + 1):
        r = rr_sess.get(searchURL + ur'?online_request_search_page=' +
                        str(pn) + ur'#Z7_01HA1A42KG4D30A3BUVH3O0000')
        tree = html.document_fromstring(r.content, parser=parser)
        parse_search(tree)
        print "Получил страницу %d из %d" % (pn, pages_number)
with open(house_path + u"/Квартиры.csv", "wb") as f:  # "Квартиры" = "Apartments"
Example No. 30
def get_data(site_code):

    url = config.get(
        'DEFAULTS',
        'weather_data_url_prefix') + '/' + site_code.upper() + config.get(
            'DEFAULTS', 'weather_data_url_file_extension')

    logger.debug('retrieval url: %s' % (url))

    # Make soup
    try:
        resp = urlopen(url)

        LastRetrieval = datetime.strptime(resp.headers['Date'],
                                          '%a, %d %b %Y %H:%M:%S %Z')
        LastModified = datetime.strptime(resp.headers['Last-Modified'],
                                         '%a, %d %b %Y %H:%M:%S %Z')

        logger.debug('web page timestamp: Last-Modified: ' +
                     resp.headers['Last-Modified'])

        contents = resp.read()
        new_contents = UnicodeDammit.detwingle(contents)
        soup = BeautifulSoup(new_contents, "html.parser")

    except URLError as e:
        logger.warn('An error occurred fetching data\n\t%s\n\t%s' %
                    (url, e.reason))
        return {}

    # Get table
    try:
        tables = soup.findAll("table")
        table = tables[3]
    except AttributeError as e:
        logger.warn('No tables found at %s, exiting: %s' % (url, e))
        return 1
    except LookupError as e:
        logger.warn('there is no index table[3] on the page for ' + url)
        return 1
    except IndexError as e:
        logger.warn('there is no index table[3] on the page for ' + url)
        return 1

    # Get rows
    try:
        rows = table.find_all('tr')
    except AttributeError as e:
        logger.warn('No table rows found at %s, exiting: %s' % (url, e))
        return 1

    # first two columns are created from the table
    table_columns = out_file_columns[3:len(out_file_columns)]

    # Get data
    table_data = parse_rows(rows)

    # prepare the data read from the web page
    today = datetime.now()
    month = today.month
    year = today.year
    monthedge = 0

    data_rows = {}
    for i in table_data:

        data = dict(zip(table_columns, i))

        day = data['Date']

        # this gets over month/year edges.
        if int(day) <= 2 and monthedge == 0:
            monthedge = 1

        hour, minute = data['Time'].split(':')

        my_month = -1

        # this gets over month/year edges.
        if int(day) > 2 and monthedge == 1:
            my_month = month - 1  # the month is coming from 'localtime' not the webpage
            if my_month == 0:  # january fix
                my_month = 12
                year = year - 1
        else:
            my_month = month

        obs_datetime = datetime(year, my_month, int(day), int(hour),
                                int(minute))

        data['site_code'] = site_code.upper()
        data['DateTime'] = obs_datetime.strftime('%Y-%m-%d %H:%M:00')
        data['TIMESTAMP'] = 'TS:' + data['DateTime']

        # these fields are stored in the database as numbers, but the web pages use 'NA' for missing data.  that string needs to be replaced with None
        check_field_values = ['AirTemp', 'Dewpoint', 'AirPressureAltimeter']
        for field in check_field_values:
            if data[field] == 'NA':
                data[field] = None
            elif not data[field]:
                data[field] = None

        data_rows[data['TIMESTAMP']] = data

    return [LastRetrieval, LastModified, data_rows]
Example No. 31
 def decode_html(html_string):
     converted = UnicodeDammit(html_string, isHTML=True)
     if not converted.unicode:
         raise UnicodeDecodeError("Failed to detect encoding, tried [%s]",
                                  ', '.join(converted.triedEncodings))
     return converted.unicode
Example No. 32
 def extract_person_profile(hxs):
     personProfile = PersonProfileItem()
     ## Person name
     nameField = {}
     nameSpan = hxs.select("//span[@id='name']/span")
     if nameSpan and len(nameSpan) == 1:
         nameSpan = nameSpan[0]
         givenNameSpan = nameSpan.select("span[@class='given-name']")
         if givenNameSpan and len(givenNameSpan) == 1:
             givenNameSpan = givenNameSpan[0]
             nameField['given_name'] = givenNameSpan.select("text()").extract()[0]
         familyNameSpan = nameSpan.select("span[@class='family-name']")
         if familyNameSpan and len(familyNameSpan) == 1:
             familyNameSpan = familyNameSpan[0]
             nameField['family_name'] = familyNameSpan.select("text()").extract()[0]
         personProfile['name'] = nameField
     else:
         return None
     
     headline = hxs.select("//dl[@id='headline']")
     if headline and len(headline) == 1:
         headline = headline[0]
         ## locality
         locality = headline.select("dd/span[@class='locality']/text()").extract()
         if locality and len(locality) == 1:
             personProfile['locality'] = locality[0].strip()
         ## industry
         industry = headline.select("dd[@class='industry']/text()").extract()
         if industry and len(industry) == 1:
             personProfile['industry'] = industry[0].strip()
     
     ## overview
     overview = hxs.select("//dl[@id='overview']").extract()
     if overview and len(overview) == 1:
         personProfile['overview_html'] = overview[0]
         homepage = LinkedinParser.parse_homepage(overview[0])
         if homepage:
             personProfile['homepage'] = homepage
         
     ## summary
     summary = hxs.select("//div[@id='profile-summary']/div[@class='content']/p[contains(@class,'summary')]/text()").extract()
     if summary and len(summary) > 0:
         personProfile['summary'] = ''.join(x.strip() for x in summary)
     
     ## specilities
     specilities = hxs.select("//div[@id='profile-specialties']/p/text()").extract()
     if specilities and len(specilities) == 1:
         specilities = specilities[0].strip()
         personProfile['specilities'] = specilities
     
     ## skills
     skills = hxs.select("//ol[@id='skills-list']/li/span/a/text()").extract()
     if skills and len(skills) > 0:
         personProfile['skills'] = [x.strip() for x in skills]
         
     additional = hxs.select("//div[@id='profile-additional']")
     if additional and len(additional) == 1:
         additional = additional[0]
         ## interests
         interests = additional.select("div[@class='content']/dl/dd[@class='interests']/p/text()").extract()
         if interests and len(interests) == 1:
             personProfile['interests'] = interests[0].strip()
         ## groups
         g = additional.select("div[@class='content']/dl/dd[@class='pubgroups']")
         if g and len(g) == 1:
             groups = {}
             g = g[0]
             member = g.select("p/text()").extract()
             if member and len(member) > 0:
                 groups['member'] = ''.join(member[0].strip())
             gs = g.select("ul[@class='groups']/li[contains(@class,'affiliation')]/div/a/strong/text()").extract()
             if gs and len(gs) > 0:
                 groups['affilition'] = gs
             personProfile['group'] = groups
         ## honors
         honors = additional.select("div[@class='content']/dl/dd[@class='honors']/p/text()").extract()
         if honors and len(honors) > 0:
             personProfile['honors'] = [x.strip() for x in honors]
     
     ## education
     education = hxs.select("//div[@id='profile-education']")
     schools = []
     if education and len(education) == 1:
         education = education[0]
         school_list = education.select("div[contains(@class,'content')]//div[contains(@class,'education')]")
         if school_list and len(school_list) > 0:
             for school in school_list:
                 s = {}
                 name = school.select("h3[contains(@class,'org')]/text()").extract()
                 if name and len(name) == 1:
                     s['name'] = name[0].strip()
                 degree = school.select("h4[@class='details-education']/span[@class='degree']/text()").extract()
                 if degree and len(degree) == 1:
                     s['degree'] = degree[0].strip()
                 major = school.select("h4[@class='details-education']/span[@class='major']/text()").extract()
                 if major and len(major) == 1:
                     s['major'] = major[0].strip()
                 period = school.select("p[@class='period']")
                 if period and len(period) == 1:
                     period = period[0]
                     start = period.select("abbr[@class='dtstart']/text()").extract()
                     end = period.select("abbr[@class='dtend']/text()").extract()
                     if len(start) == 1:
                         s['start'] = start[0]
                     if len(end) == 1:
                         s['end'] = end[0]
                 desc = school.select("p[contains(@class,'desc')]/text()").extract()
                 if len(desc) == 1:
                     s['desc'] = desc[0].strip()
                 schools.append(s)
             personProfile['education'] = schools 
     
     ## experience
     experience = hxs.select("//div[@id='profile-experience']")
     if experience and len(experience) == 1:
         es = []
         experience = experience[0]
         exps = experience.select("//div[contains(@class,'experience')]")
         if len(exps) > 0:
             for e in exps:
                 je = {}
                 title = e.select("div[@class='postitle']//span[@class='title']/text()").extract()
                 if len(title) > 0:
                     je['title'] = title[0].strip()
                 org = e.select("div[@class='postitle']//span[contains(@class,'org')]/text()").extract() 
                 if len(org) > 0:
                     je['org'] = org[0].strip()
                 start = e.select("p[@class='period']/abbr[@class='dtstart']/text()").extract()
                 if len(start) > 0:
                     je['start'] = start[0].strip()
                 end = e.select("p[@class='period']/abbr[@class='dtstamp']/text()").extract()
                 if len(end) > 0:
                     je['end'] = end[0].strip()
                 location = e.select("p[@class='period']/abbr[@class='location']/text()").extract()
                 if len(location) > 0:
                     je['location'] = location[0]
                 desc = e.select("p[contains(@class,'description')]/text()").extract()
                 if len(desc) > 0:
                     je['desc'] = "".join(x.strip() for x in desc)
                 es.append(je)
         personProfile['experience'] = es
                 
     ## Also view
     alsoViewProfileList = []
     divExtra = hxs.select("//div[@id='extra']")
     if divExtra and len(divExtra) == 1:
         divExtra = divExtra[0]
         divAlsoView = divExtra.select("//div[@class='leo-module mod-util browsemap']")
         if divAlsoView and len(divAlsoView) == 1:
             divAlsoView = divAlsoView[0]
             alsoViewList = divAlsoView.select("div[@class='content']/ul/li/strong/a/@href").extract()
             if alsoViewList:
                 for alsoViewItem in alsoViewList:
                     alsoViewItem = UnicodeDammit(alsoViewItem).unicode_markup
                     item = HtmlParser.get_also_view_item(alsoViewItem)
                     alsoViewProfileList.append(item)
                 personProfile['also_view'] = alsoViewProfileList
     return personProfile
Ejemplo n.º 33
0
	def decode_html(html_string):
		converted = UnicodeDammit(html_string)
		if not converted.unicode_markup:
			# UnicodeDecodeError needs five arguments, so raise a ValueError instead
			raise ValueError("Failed to detect encoding, tried [{}]".format(', '.join(converted.tried_encodings)))
		# print(converted.original_encoding)
		return converted.unicode_markup
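A quick sketch of what the helper above returns for two hypothetical byte strings; UnicodeDammit exposes the decoded text as unicode_markup, the detected codec as original_encoding, and the attempted codecs as tried_encodings:

from bs4 import UnicodeDammit

utf8_bytes = 'Qui êtes-vous ?'.encode('utf-8')
cp1252_bytes = 'Qui êtes-vous ?'.encode('windows-1252')   # not valid UTF-8

for raw in (utf8_bytes, cp1252_bytes):
    dammit = UnicodeDammit(raw)
    # decoded text plus the codec UnicodeDammit settled on
    print(repr(dammit.unicode_markup), dammit.original_encoding)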
Ejemplo n.º 34
0
	def on_pubmsg(self, c, e):
		nick = e.source.nick
		target = e.target if is_channel(e.target) else nick
		def reply(msg):
			self.send(target, msg)
		def dm(msg):
			self.send(nick, msg)
		line = UnicodeDammit(e.arguments[0]).unicode_markup
		log('   \033[37m{}→{}\033[0m'.format(nick, line))
		a = line.split(":", 1)
		if len(a) > 1 and a[0].lower() == self.nick:
			self.do_command(e, a[1].strip().lower(), nick, target, reply, dm)
			return

		# zeltofilter
		if 'zeltoph' in nick:
			return

		foo = settings.VIPS.get(nick, 0)
		if random() < foo:
			self.kick(nick)
	
		match = re.match('.*┻━┻.*', line)
		if match:
			reply('┬─┬ノ(ಠ_ಠノ)')
			return

		match = re.match('^({} *:)? *chaos-?([☆★☼☀*]|sternchen) *: ?(.*)$'.format(self.nick), line)
		if match:
			newcs = match.group(3)
			self.chaossternchen.append(newcs)
			self.sendchan('Chaos-☆ Nr. {} notiert: {}'.format(len(self.chaossternchen), newcs))
			return

		if line.startswith('.wiki '):
			wikipage = line[len('.wiki '):].strip()
			if re.match('^[-_+\w]+$', wikipage):
				wikiurl = 'http://afra-berlin.de/dokuwiki/doku.php?id={}'.format(wikipage)
				if 'Dieses Thema existiert noch nicht' in requests.get(wikiurl).text:
					reply("I'm sorry, I can't find a wiki page with that name.")
				else:
					reply(wikiurl)
			else:
				reply('Try to troll somebot else.')
			return

		if line == 'wat?':
			reply("I don't have a clue.")
			return
		if re.match('^hail eris[.!]* ', line.lower()):
			reply("All Hail Discordia!")
			return
		m = re.findall('(^|\s)?(gh?ah?nh?dh?ih?)(\s|$)?', line, re.IGNORECASE)
		for _1,match,_2 in m:
			if not re.match('(^|\s)?gandhi(\s|$)?', match, re.IGNORECASE):
				self.kick(nick, "It's spelled Gandhi")
				return
		if re.search('https?://[-a-z0-9.]*facebook.com', line.lower()):
			reply('A facebook link? srsly? Get some self-respect!')
			return
		match = re.search('https?://pr0gramm.com/#(newest/\*/[0-9/]*)', line.lower())
		if match:
			reply('Fixed that pr0gramm link for you: http://pr0gramm.com/static/'+match.group(1))
			return
		if line == 'moin':
			self.moincount += 1
			if self.moincount == 5:
				reply('moin')
			return
		else:
			self.moincount = 0
		if line.lstrip('.!#').startswith('eta '):
			eta = line[4:].strip()
			with self.db as db:
				db.execute("DELETE FROM etas WHERE nick=?", (nick,))
				if eta:
					db.execute("INSERT INTO etas VALUES (DATETIME('now'), ?, ?)", (nick, eta))
			dm('ETA registered. Thanks!')
			return
		m = re.findall(URL_REGEX, line.lower())
		for url,*_ in m:
			res = requests.get(url)
			if res.status_code == requests.codes.ok:
				soup = BeautifulSoup(res.text)
				reply(soup.title.string)
		m = re.findall('(^|\s)(afra)(\s|$)', line, re.IGNORECASE)
		for _1,match,_2 in m:
			if match != 'AfRA' and match != 'afra' and random() < 0.1:
				reply("I'm sure you meant AfRA, not "+match)
				return
Ejemplo n.º 35
0
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another \
                     format. Formats are determined automatically from file \
                     extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .ndj, or .tsv)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .ndj, or .tsv)')
    parser.add_argument('-i', '--id_col',
                        help='Name of the column which contains the instance \
                              IDs in ARFF, CSV, or TSV files.',
                        default='id')
    label_group = parser.add_mutually_exclusive_group(required=False)
    label_group.add_argument('-l',
                             '--label_col',
                             help='Name of the column which contains the class \
                                   labels in ARFF, CSV, or TSV files. For ARFF \
                                   files, this must be the final column to count as\
                                   the label.',
                             default='y')
    label_group.add_argument('--no_labels',
                             action='store_true',
                             default=False,
                             help='Used to indicate that the input data has no labels.')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not \
                              classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from labels and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension not in EXT_TO_READER:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line, ['utf-8',
                                        'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(_pair_to_dict_tuple(pair) for pair in
                            feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in
                             label_map_str
                             .strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    label_col = None if args.no_labels else args.label_col

    # Iterate through input file and collect the information we need
    reader = EXT_TO_READER[input_extension](args.infile,
                                            quiet=args.quiet,
                                            label_col=label_col,
                                            id_col=args.id_col)
    feature_set = reader.read()
    # write out the file in the requested output format
    writer_type = EXT_TO_WRITER[output_extension]
    writer_args = {'quiet': args.quiet}
    if writer_type is CSVWriter or writer_type is TSVWriter:
        writer_args['label_col'] = label_col
        writer_args['id_col'] = args.id_col
    elif writer_type is ARFFWriter:
        writer_args['label_col'] = label_col
        writer_args['id_col'] = args.id_col
        writer_args['regression'] = args.arff_regression
        writer_args['relation'] = args.arff_relation
    elif writer_type is LibSVMWriter:
        writer_args['label_map'] = label_map
    writer = writer_type(args.outfile, feature_set, **writer_args)
    writer.write()
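The --reuse_libsvm_map branch above only works when the .libsvm comments carry the example id, label map and feature map. A minimal parsing sketch, assuming a comment layout of `example_id | label map | feature map` (inferred from the split('#') / split('|') calls) and using a hypothetical stand-in for _pair_to_dict_tuple:

def _pair_to_dict_tuple(pair):
    # hypothetical helper: 'number=name' -> (name, number)
    number, name = pair.split('=')
    return (name, int(number))

line = '3 1:1.0 2:0.5 # EXAMPLE_7 | 1=spam 2=ham | 1=word_count 2=has_url'
comments = line.split('#')[1]
_, label_map_str, feat_map_str = comments.split('|')
feat_map = dict(_pair_to_dict_tuple(p) for p in feat_map_str.strip().split())
label_map = dict(_pair_to_dict_tuple(p) for p in label_map_str.strip().split())
print(feat_map)   # {'word_count': 1, 'has_url': 2}
print(label_map)  # {'spam': 1, 'ham': 2}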
Ejemplo n.º 36
0
def end_text():
    text_of_quote = UnicodeDammit(
        str(all_quotes[random.randint(1, len(all_quotes)) - 1].find(
            "div", class_="text"))[18:-6].replace("<br/>", "\n").replace(
                "<br>", "").replace("</br>", ""))
    return text_of_quote.unicode_markup
Ejemplo n.º 37
0
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
url = "http://www.weather.com.cn/weather/101190301.shtml"
try:
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=headers)
    data = urllib.request.urlopen(req)
    data = data.read()
    dammit = UnicodeDammit(data, ["utf-8", "gbk"])
    data = dammit.unicode_markup
    soup = BeautifulSoup(data, 'lxml')
    lis = soup.select("ul[class='t clearfix'] li")
    for li in lis:
        try:
            date = li.select('h1')[0].text
            weather = li.select('p[class="wea"]')[0].text
            temp = li.select('p[class="tem"] span')[0].text + "/" + li.select(
                'p[class="tem"] i')[0].text
            print(date, weather, temp)
        except Exception as err:
            print(err)
except Exception as err:
    print(err)
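The ["utf-8", "gbk"] argument above is a list of candidate encodings that UnicodeDammit tries, in order, before falling back to detection. A small sketch with a hypothetical GBK payload:

from bs4 import UnicodeDammit

gbk_bytes = '今天多云'.encode('gbk')          # not valid UTF-8
dammit = UnicodeDammit(gbk_bytes, ['utf-8', 'gbk'])
print(dammit.original_encoding)              # 'gbk' (utf-8 is tried first and fails)
print(dammit.unicode_markup)                 # 今天多云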
Ejemplo n.º 38
0
    def _whois_domain(self, param):

        config = self.get_config()
        server = config.get(phantom.APP_JSON_SERVER, None)

        domain = param[phantom.APP_JSON_DOMAIN]

        action_result = self.add_action_result(ActionResult(dict(param)))
        action_result.set_param({phantom.APP_JSON_DOMAIN: domain})

        # This sleep is required between two calls, else the server might
        # throttle the queries when done in quick succession, which leads
        # to a 'Connection reset by peer' error.
        # Sleep before doing anything (as opposed to after), so that even
        # if this action returns an error, the sleep still happens and the
        # next call only runs after it.
        time.sleep(1)

        try:
            domain = self._get_domain(domain)
        except Exception as e:
            error_message = self._get_error_message_from_exception(e)
            return action_result.set_status(phantom.APP_ERROR, WHOIS_ERR_PARSE_INPUT, error_message)

        self.debug_print("Validating/Querying Domain {0}".format(repr(domain)))

        action_result.update_summary({phantom.APP_JSON_DOMAIN: domain})

        self.save_progress("Querying...")

        pythonwhois.parse.registrant_regexes.extend(REGISTRANT_REGEXES)
        pythonwhois.parse.admin_contact_regexes.extend(ADMIN_CONTACT_REGEXES)
        pythonwhois.parse.tech_contact_regexes.extend(TECH_CONTACT_REGEXES)
        pythonwhois.parse.billing_contact_regexes.extend(BILLING_CONTACT_REGEXES)

        # 1. Attempting to fetch the whois information with the server
        # if provided or without it if not provided
        whois_response = self._fetch_whois_info(action_result, domain, server)

        if whois_response is None:
            return action_result.get_status()

        # 2. Attempting to fetch the whois information with the server obtained
        # in the output response of the first step above
        if whois_response.get('contacts') and not whois_response.get('contacts').get('registrant'):
            if whois_response.get('whois_server'):
                resp_server = UnicodeDammit(whois_response.get('whois_server')[0]).unicode_markup.encode('utf-8')

                whois_response = self._fetch_whois_info(action_result, domain, resp_server)

                if whois_response is None:
                    return action_result.get_status()
            else:
                self.debug_print("No second API call required as the server information could not be fetched from the first WHOIS API call")

        self.save_progress("Parsing response")

        try:
            # Need to work on the json, it contains certain fields that are not
            # parsable, so will need to go the 'fallback' way.
            # TODO: Find a better way to do this
            whois_response = json.dumps(whois_response, default=_json_fallback)
            whois_response = json.loads(whois_response)
            action_result.add_data(whois_response)
        except Exception as e:
            error_message = self._get_error_message_from_exception(e)
            return action_result.set_status(phantom.APP_ERROR, WHOIS_ERR_PARSE_REPLY, error_message)

        # Even if the query was successful, the data might not be available
        if self._response_no_data(whois_response, domain):
            return action_result.set_status(phantom.APP_SUCCESS, '{}, but, {}.'.format(WHOIS_SUCC_QUERY, WHOIS_ERR_QUERY_RETURNED_NO_CONTACTS_DATA))
        else:
            # get the registrant
            if whois_response.get('contacts') and whois_response.get('contacts').get('registrant'):
                registrant = whois_response['contacts']['registrant']
                wanted_keys = ['organization', 'name', 'city', 'country']
                summary = {x: registrant[x] for x in wanted_keys if x in registrant}
                action_result.update_summary(summary)
                action_result.set_status(phantom.APP_SUCCESS)
            else:
                action_result.set_status(phantom.APP_SUCCESS, '{}, but, {}.'.format(WHOIS_SUCC_QUERY, WHOIS_SUCC_QUERY_RETURNED_NO_REGISTRANT_DATA))

        return phantom.APP_SUCCESS
Ejemplo n.º 39
0
headings=[]
img=[]
links=[]
imagelinks=[]
imagewidths=[]
imageheights=[]
con=pymysql.connect(host='localhost',
                    user='******',
                    password='******',
                    db='myFlaskApp',
                    charset='utf8',
                    cursorclass=pymysql.cursors.DictCursor)
cur=con.cursor()
cur.execute('DROP TABLE IF EXISTS datacities')
cur.execute('CREATE TABLE datacities(id INT(11) AUTO_INCREMENT PRIMARY KEY,heading VARCHAR(1000),link VARCHAR(1000),imagelink VARCHAR(1000),imagewidth INT(11),imageheight INT(11),article VARCHAR(13000))')
for i in range(0,total_num-1):
	dammit=UnicodeDammit(title[i].get_text())
	headings.append(dammit.unicode_markup)
	links.append(title[i].find('a')['href'])
	img=temp.find_all('div',{'class':'snaps'})
	imagelinks.append(img[i].find('img')['data-lazy-src'])
	imageheights.append(img[i].find('img')['height'])
	imagewidths.append(img[i].find('img')['width'])
	reqpage=requests.get(links[i])
	reqsoup=BeautifulSoup(reqpage.content,'html.parser')
	yo=reqsoup.find('div',{'class':'articles'}).findAll('p')
	length=len(yo)
	mypara=''
	for j in range(0,length-1):
		dammit=UnicodeDammit(yo[j].get_text().encode('utf8'))
		pp=str(dammit.unicode_markup)
		mypara=str(mypara+"\n"+pp)
Ejemplo n.º 40
0
def maybe_convert(record, domain):
    """Converts a WARC record to JSON rows for the Page and Html tables.

  Args:
    record: warcio.Record
    domain: string

  Returns:
    dict, JSON Page record, or None
  """
    if record.rec_type != 'response':
        return

    if (record.http_headers.get_statuscode() != '200'
            or not record.http_headers.get('Content-Type',
                                           '').startswith('text/html')):
        return

    url = record.rec_headers.get('WARC-Target-URI')
    if url in seen_urls or blacklist.URL_BLACKLIST_RE.search(url):
        return

    assert domain
    url_domain = urlparse(url).netloc.split(':')[0]
    if url_domain != domain and not url_domain.endswith('.' + domain):
        return

    row = {
        'domain':
        url_domain,
        'url':
        url,
        'fetch_time':
        record.rec_headers.get('WARC-Date'),
        'rels': [],  # placeholders so that key order is preserved
        'u_urls': [],
        'mf2_classes': [],
        'mf2':
        '{}',
        'headers': [{
            'name': name,
            'value': value
        } for name, value in sorted(record.http_headers.headers)],
    }
    content_length = record.http_headers.get('Content-Length')
    if content_length and int(content_length) > MAX_ROW_SIZE:
        row.update({
            'html': MAX_ROW_MESSAGE,
            'mf2': json.dumps({MAX_ROW_MESSAGE: None}),
        })
        return row

    # TODO: charset from HTTP header Content-Type
    #
    # use UnicodeDammit to gracefully handle response bodies containing bytes
    # that are invalid for their character encoding, e.g. invalid start or
    # continuation bytes in UTF-8.
    body_bytes = record.content_stream().read()
    body = UnicodeDammit(body_bytes).unicode_markup
    if not body:
        return

    if url in seen_urls:
        return
    seen_urls.add(url)

    soup = BeautifulSoup(body, 'lxml')

    links = [
        {
            'tag': link.name,
            'url': link['href'],
            'inner_html':
            ''.join(str(c) for c in link.children),  # inner HTML content
            'rels': link.get('rel', []),
            'classes': link.get('class', []),
        } for link in soup.find_all('link') + soup.find_all('a')
        if link.get('href')
    ]

    row.update({
        'links':
        links[:MAX_LINKS],
        # heuristic: check that HTML is <= 1/2 max size to avoid cost of serializing
        # this whole JSON object just to check its length.
        'html':
        body if len(body_bytes) <= MAX_ROW_SIZE / 2 else MAX_ROW_MESSAGE,
    })

    try:
        mf2 = mf2py.parse(url=url, doc=soup)
    except Exception as e:
        print('mf2py.parse with lxml failed on %s; switching to html5lib: %s' %
              (url, e))
        try:
            mf2 = mf2py.parse(url=url, doc=BeautifulSoup(body, 'html5lib'))
        except Exception as e2:
            print('mf2py.parse with html5lib failed too, giving up: %s' % e2)
            return row

    def mf2_classes(obj):
        if isinstance(obj, (list, tuple)):
            return sum((mf2_classes(elem) for elem in obj), [])
        elif isinstance(obj, dict):
            items = obj.get('items') or obj.get('children') or []
            return obj.get('type', []) + mf2_classes(items)
        raise RuntimeError('unexpected type: %r' % obj)

    mf2_str = json.dumps(mf2 or {})
    row.update({
        'rels': [{
            'value': val,
            'urls': urls
        } for val, urls in mf2.get('rels', {}).items()],
        'u_urls':
        get_urls(mf2.get('items', [])),
        'mf2_classes':
        sorted(set(mf2_classes(mf2))),
        'mf2': (mf2_str if len(mf2_str) <= MAX_ROW_SIZE / 2 else json.dumps(
            {MAX_ROW_MESSAGE: None})),
    })
    return row
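As the comment in maybe_convert notes, UnicodeDammit degrades gracefully when a response body contains bytes that are invalid for their encoding. A tiny sketch with a deliberately broken UTF-8 tail:

from bs4 import UnicodeDammit

# 0xe2 is a UTF-8 lead byte that is not followed by valid continuation bytes
broken = '<p>ok</p>'.encode('utf-8') + b'\xe2\x28\xa1'
dammit = UnicodeDammit(broken)
print(dammit.original_encoding)     # falls back to a more permissive codec, e.g. windows-1252
print(repr(dammit.unicode_markup))  # still a usable str, so BeautifulSoup can parse it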
Ejemplo n.º 41
0
async def scrape_page(page, feed_id, loop):
    # Connect to database
    session = models.Session()

    print('Scrape initiated for page ' + str(page) + ' of Hacker News.')

    # Get current UTC time in seconds
    now = int(datetime.utcnow().strftime('%s'))

    # Get HTML tree from feed page
    feed_html = requests.get('https://news.ycombinator.com/news?p=' +
                             str(page))

    feed_content = feed_html.content

    feed_soup = BeautifulSoup(feed_content, 'html.parser')

    # Get all post rows from HTML tree
    post_rows = feed_soup.find_all('tr', 'athing')

    for post_row in post_rows:
        # Get subtext row with additional post data
        subtext_row = post_row.next_sibling

        # Get post id
        post_id = post_row.get('id')

        # Check if post exists in database
        post_exists = session.query(
            models.Post.id).filter_by(id=post_id).scalar()

        # Get core post data if it is not in database already
        if not post_exists:
            # Get UTC timestamp for post's posting time by subtracting the
            # number of days/hours/minutes ago given on the webpage from the
            # current UTC timestamp
            time_unit = subtext_row.find('span', 'age').a.get_text().split()[1]

            if 'day' in time_unit:
                created = now - 86400 * int(
                    subtext_row.find('span', 'age').a.get_text().split()[0])

            elif 'hour' in time_unit:
                created = now - 3600 * int(
                    subtext_row.find('span', 'age').a.get_text().split()[0])

            else:
                created = now - 60 * int(
                    subtext_row.find('span', 'age').a.get_text().split()[0])

            created = time.strftime('%Y-%m-%d %H:%M', time.localtime(created))

            # Get post's link
            link = post_row.find('a', 'storylink').get('href')

            # Get post's title
            title = post_row.find('a', 'storylink').get_text()

            # Set post's type based on title
            if 'Show HN:' in title:
                type = 'show'
            elif 'Ask HN:' in title:
                type = 'ask'
            else:
                type = 'article'

            # Get username of user who posted post or set as blank for job
            # posting
            if subtext_row.find('a', 'hnuser'):
                username = subtext_row.find('a', 'hnuser').get_text()
            else:
                username = ''

            # Get website that post is from or set as blank for ask posting
            if post_row.find('span', 'sitestr'):
                website = post_row.find('span', 'sitestr').get_text()
            else:
                website = ''

            # Add post data to database
            post = models.Post(created=created,
                               id=post_id,
                               link=link,
                               title=title,
                               type=type,
                               username=username,
                               website=website)

            session.add(post)

        # Get post's comment count if it is listed (otherwise, set to 0)
        if 'comment' in subtext_row.find_all(href='item?id=' +
                                             post_id)[-1].get_text():
            unicode_count = UnicodeDammit(
                subtext_row.find_all(href='item?id=' + post_id)[-1].get_text())
            comment_count = unicode_count.unicode_markup.split()[0]
        else:
            comment_count = 0

        # Get post's rank on feed page
        feed_rank = post_row.find('span', 'rank').get_text()[:-1]

        # Get post's score if it is listed (otherwise, post is job posting)
        if subtext_row.find('span', 'score'):
            point_count = subtext_row.find('span',
                                           'score').get_text().split()[0]
        else:
            point_count = 0
            type = 'job'

        # Add feed-based post data to database
        feed_post = models.FeedPost(comment_count=comment_count,
                                    feed_id=feed_id,
                                    feed_rank=feed_rank,
                                    point_count=point_count,
                                    post_id=post_id)

        session.add(feed_post)

        session.commit()

        # Create asynchronous task to scrape post page for its comments
        loop.create_task(scrape_post(post_id, feed_id, loop, None))

    return
Ejemplo n.º 42
0
    def _import(self, message):
        """import <url> [<alias(es)>] - imports all aliases from the given address, or only the listed aliases"""
        if message.User.Name not in GlobalVars.admins:
            return IRCResponse(ResponseType.Say,
                               u"Only my admins may import aliases!",
                               message.ReplyTo)
        if len(message.ParameterList) < 2:
            return IRCResponse(ResponseType.Say,
                               u"You didn't give a url to import from!",
                               message.ReplyTo)

        if len(message.ParameterList) > 2:
            onlyListed = True
            importList = [alias.lower() for alias in message.ParameterList[2:]]
        else:
            onlyListed = False

        url = message.ParameterList[1]
        try:
            page = WebUtils.fetchURL(url)
        except ValueError:
            return IRCResponse(ResponseType.Say,
                               u"'{}' is not a valid URL".format(url),
                               message.ReplyTo)
        if page is None:
            return IRCResponse(ResponseType.Say,
                               u"Failed to open page at {}".format(url),
                               message.ReplyTo)

        text = page.body
        text = UnicodeDammit(text).unicode_markup
        lines = text.splitlines()
        numAliases = 0
        numHelpTexts = 0
        for lineNumber, line in enumerate(lines):
            # Skip over blank lines
            if line == u"":
                continue
            splitLine = line.split()
            if splitLine[0].lower() != u"{}alias".format(self.bot.commandChar):
                return IRCResponse(ResponseType.Say,
                                   u"Line {} at {} does not begin with {}alias".format(lineNumber,
                                                                                       url,
                                                                                       self.bot.commandChar),
                                   message.ReplyTo)
            subCommand = splitLine[1].lower()
            if subCommand not in [u"add", u"help"]:
                return IRCResponse(ResponseType.Say,
                                   u"Line {} at {} is not an add or help command".format(lineNumber, url),
                                   message.ReplyTo)

            aliasName = splitLine[2].lower()
            aliasCommand = splitLine[3:]
            aliasCommand[0] = aliasCommand[0].lower()

            # Skip over aliases that weren't listed, if any were listed
            if onlyListed and aliasName not in importList:
                continue

            if subCommand == u"add":
                self._newAlias(aliasName, aliasCommand)
                numAliases += 1
            elif subCommand == u"help":
                aliasHelp = u" ".join(splitLine[3:])
                self.aliasHelpDict[aliasName] = aliasHelp
                numHelpTexts += 1

        return IRCResponse(ResponseType.Say,
                           u"Imported {} alias(es) and {} help string(s) from {}".format(numAliases,
                                                                                         numHelpTexts,
                                                                                         url),
                           message.ReplyTo)
Ejemplo n.º 43
0
def wordprocessing(self, database, language, lemmatizer, news_comments,
                   news_comments_start_date, news_comments_end_date,
                   exclude_vowels, stopwords, stemmer, upload_textarea,
                   upload_option, ignore_results_amount, upload_url):

    self.total_steps = 7

    self.current_step = 1
    self.cs_name = "Initializing"
    self.reset_amount()
    self.cs_total_amount = 0
    self.update_meta()

    #Language check
    if language not in ['english', 'dutch']:
        return {'status': 'error', 'message': "Invalid language!"}

    if database not in connections:
        return {'status': 'error', 'message': "Invalid database!"}

    self.current_step = 2
    self.cs_name = "Normalizing Input"
    self.reset_amount()
    self.cs_total_amount = 0
    self.update_meta()

    #Input normalization
    if upload_option == 'text_field':
        input_text = upload_textarea
    elif upload_option == 'url':
        page_text = requests.get(upload_url).text
        soup = BeautifulSoup(page_text, "html.parser")
        input_text = soup.text
    elif upload_option == 'file':
        input_text = UnicodeDammit(upload_file.read()).unicode_markup
    elif upload_option == 'news_comments':
        start_date_text = news_comments_start_date
        end_date_text = news_comments_end_date
        start_date = datetime.date(
            *[int(i) for i in start_date_text.split('-')])
        end_date = datetime.date(*[int(i) for i in end_date_text.split('-')])
        filters = {
            'date__gte': start_date,
            'date__lte': end_date,
            'text__isnull': False
        }
        input_text = ""
        if news_comments in ['news', 'news_comments']:
            self.cs_name = "Normalizing Input - Reading Newsitems"
            queryset = Newsitem.objects\
                               .using(database)\
                               .filter(**filters)\
                               .select_related('text')
            self.cs_total_amount = queryset.count()
            for newsitem in queryset:
                input_text += "\n" + newsitem.text.text
                self.increment_amount_done()
        if news_comments in ['comments', 'news_comments']:
            self.cs_name = "Normalizing Input - Reading Comments"
            queryset = Comment.objects\
                       .using(database)\
                       .filter(**filters)\
                       .select_related('text')
            self.cs_total_amount = queryset.count()
            for comment in queryset:
                input_text += "\n" + comment.text.text
                self.increment_amount_done()

    #Stemmer selection
    if stemmer == 'no_stemmer':
        stemmer = None
    elif stemmer == 'porter':
        if language != 'english':
            return jsonify(status='error',
                           message="Invalid language for stemmer porter!")
        stemmer = PorterStemmer()
    elif stemmer == 'snowball':
        stemmer = SnowballStemmer(language)
    else:
        return jsonify(status='error', message="Invalid stemmer!")

    #Lemmatizer selection
    if lemmatizer == 'lemmatizer_off':
        lemmatizer = None
    elif language == 'english':
        lemmatizer = lemmatizer_en
    else:
        lemmatizer = lemmatizer_nl

    #Stopwords selection
    if stopwords == 'no_stopwords':
        stopwords = None
    elif stopwords == 'our_stopwords':
        stopwords = obo.stopwords
    elif stopwords == 'custom_stopwords':
        custom_stopword_text = UnicodeDammit(
            input_json.get('custom_stopword_file').read()).unicode_markup
        stopwords = obo.stripNonAlphaNum(custom_stopword_text)

    self.current_step = 3
    self.cs_name = "Wordlist creation"
    self.reset_amount()
    self.cs_total_amount = len(input_text)
    self.update_meta()

    #Process the text
    input_text_word_count = 0
    resulting_text = ""
    final_wordlist = []
    for word_type, word in text_processor.parse_text(input_text):
        if word_type == "non-word":
            resulting_text += word
        else:
            input_text_word_count += 1
            processed_word = word
            if stemmer:
                processed_word = stemmer.stem(processed_word)
            if lemmatizer:
                processed_word = lemmatizer(processed_word)
            if not stopwords or processed_word not in stopwords:
                if exclude_vowels == 'exclude_vowels_yes':
                    if language == 'english':
                        regex = re_vowel_en
                    else:
                        regex = re_vowel_nl
                    processed_word = regex.sub("", processed_word)
                resulting_text += processed_word
                final_wordlist.append(processed_word)
        self.cs_amount_done += len(word)
        self.calculate_status_update()

    self.current_step = 4
    self.cs_name = "obo.wordListToFreqDict"
    self.reset_amount()
    self.cs_total_amount = 0
    self.update_meta()

    dictionary = obo.wordListToFreqDict(final_wordlist)

    self.current_step = 5
    self.cs_name = "obo.sortFreqDict"
    self.reset_amount()
    self.cs_total_amount = 0
    self.update_meta()

    sorteddict = obo.sortFreqDict(dictionary)

    self.current_step = 6
    self.cs_name = "Dealing with Ignored Results"
    self.reset_amount()
    self.cs_total_amount = 0
    self.update_meta()

    ignore_results_amount = int(ignore_results_amount)

    if ignore_results_amount > 0:
        self.cs_total_amount = len(resulting_text)
        initial_index = ignore_results_amount
        ignored_words = [word for rank, word in sorteddict[:initial_index]]
        sorteddict = sorteddict[initial_index:]
        new_text = ""
        new_wordlist = []
        for word_type, word in text_processor.parse_text(resulting_text):
            if word_type == "non-word":
                new_text += word
            elif word not in ignored_words:
                new_text += word
                new_wordlist.append(word)
            self.cs_amount_done += len(word)
            self.calculate_status_update()
        resulting_text = new_text
        final_wordlist = new_wordlist
    else:
        initial_index = 0

    self.current_step = 7
    self.cs_name = "Doing the math"
    self.reset_amount()
    self.cs_total_amount = 0
    self.update_meta()

    input_text_char_count = len(input_text)
    word_count = len(final_wordlist)
    distinct_words_count = len(sorteddict)
    words = []
    frequencies = []
    word_cloud = []
    for frequency, word in sorteddict:
        words.append(word)
        frequencies.append(frequency)
        word_cloud.append([word, frequency])

    acum_perc = Decimal(0)
    percentages = []
    acum_perc_list = []
    for freq in frequencies:
        perc = Decimal((freq * 100.0) / word_count)
        percentages.append(round(perc, 2))
        acum_perc += perc
        acum_perc_list.append(round(acum_perc, 2))

    logarithms = []
    for i in range(len(sorteddict)):
        logarithms.append((math.log(i + 1), math.log(frequencies[i])))

    #Calculate Linear regression
    #http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.lstsq.html#numpy.linalg.lstsq
    x = numpy.array([math.log(f) for f in frequencies])
    y = numpy.array(
        [math.log(rank) for rank in range(1, distinct_words_count + 1)])
    A = numpy.vstack([x, numpy.ones(len(x))]).T
    m, c = numpy.linalg.lstsq(A, y)[0]

    #Calculate the regression line start and end,
    #  and sort making the start be the one with the lower X value
    #  (highcharts requires this)
    regline_start = (0, c)
    regline_end = (math.log(distinct_words_count),
                   math.log(distinct_words_count) * m + c)
    regression_line = {'start': regline_start, 'end': regline_end}

    return {
        'results': {
            'status': 'success',
            'words': words,
            'frequencies': frequencies,
            'percentages': percentages,
            'acum_perc_list': acum_perc_list,
            'logarithms': logarithms,
            'regression_line': regression_line,
            'resulting_text': resulting_text,
            'input_text_char_count': input_text_char_count,
            'input_text_word_count': input_text_word_count,
            'output_text_word_count': word_count,
            'word_cloud': word_cloud,
            'sorteddict': sorteddict
        }
    }
Ejemplo n.º 44
0
 def process_txt(self, fileobj):
     return UnicodeDammit.detwingle(fileobj.read())
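detwingle, used here and in the IRC normalize helper further down, targets byte strings that mix UTF-8 with embedded Windows-1252 characters (typically smart quotes): it rewrites the Windows-1252 bytes so the whole document decodes as UTF-8. A small sketch with hypothetical input:

from bs4 import UnicodeDammit

utf8_part = 'Voilà'.encode('utf-8')
cp1252_part = '“quoted”'.encode('windows-1252')   # smart quotes, invalid as UTF-8
mixed = utf8_part + b' ' + cp1252_part

fixed = UnicodeDammit.detwingle(mixed)            # bytes, now consistently UTF-8
print(fixed.decode('utf-8'))                      # Voilà “quoted”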
Ejemplo n.º 45
0
    def _sub_read(self, f):
        """
        Parameters
        ----------
        f : file buffer
            A file buffer for a LibSVM file.

        Yields
        ------
        curr_id : str
            The current ID for the example.
        class_name : float or str
            The name of the class label for the example.
        example : dict
            The example, in dictionary format, with 'x'
            as a list of features.

        Raises
        ------
        ValueError
            If line does not look like valid libsvm format.
        """
        for example_num, line in enumerate(f):
            curr_id = ''
            # Decode line if it's not already str
            if isinstance(line, bytes):
                line = UnicodeDammit(line,
                                     ['utf-8', 'windows-1252']).unicode_markup
            match = self.line_regex.search(line.strip())
            if not match:
                raise ValueError('Line does not look like valid libsvm format'
                                 '\n{}'.format(line))
            # Metadata is stored in comments if this was produced by SKLL
            if match.group('comments') is not None:
                # Store mapping from feature numbers to names
                if match.group('feat_map'):
                    feat_map = {}
                    for pair in match.group('feat_map').split():
                        number, name = pair.split('=')
                        for orig, replacement in \
                                LibSVMReader.LIBSVM_REPLACE_DICT.items():
                            name = name.replace(orig, replacement)
                        feat_map[number] = name
                else:
                    feat_map = None
                # Store mapping from label/class numbers to names
                if match.group('label_map'):
                    label_map = dict(
                        pair.split('=')
                        for pair in match.group('label_map').strip().split())
                else:
                    label_map = None
                curr_id = match.group('example_id').strip()

            if not curr_id:
                curr_id = 'EXAMPLE_{}'.format(example_num)

            class_num = match.group('label_num')
            # If we have a mapping from class numbers to labels, get label
            if label_map:
                class_name = label_map[class_num]
            else:
                class_name = class_num
            class_name = safe_float(class_name, replace_dict=self.class_map)

            curr_info_dict = dict(
                self._pair_to_tuple(pair, feat_map)
                for pair in match.group('features').strip().split())

            yield curr_id, class_name, curr_info_dict
Ejemplo n.º 46
0
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another \
                     format. Formats are determined automatically from file \
                     extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('-i', '--id_col',
                        help='Name of the column which contains the instance \
                              IDs in ARFF, CSV, or TSV files.',
                        default='id')
    label_group = parser.add_mutually_exclusive_group(required=False)
    label_group.add_argument('-l',
                             '--label_col',
                             help='Name of the column which contains the class \
                                   labels in ARFF, CSV, or TSV files. For ARFF \
                                   files, this must be the final column to count as\
                                   the label.',
                             default='y')
    label_group.add_argument('--no_labels',
                             action='store_true',
                             default=False,
                             help='Used to indicate that the input data has no labels.')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not \
                              classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from labels and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension not in EXT_TO_READER:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line, ['utf-8',
                                        'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(_pair_to_dict_tuple(pair) for pair in
                            feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in
                             label_map_str
                             .strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    label_col = None if args.no_labels else args.label_col

    # Iterate through input file and collect the information we need
    reader = EXT_TO_READER[input_extension](args.infile,
                                            quiet=args.quiet,
                                            label_col=label_col,
                                            id_col=args.id_col)
    feature_set = reader.read()
    # write out the file in the requested output format
    writer_type = EXT_TO_WRITER[output_extension]
    writer_args = {'quiet': args.quiet}
    if writer_type is CSVWriter or writer_type is TSVWriter:
        writer_args['label_col'] = label_col
        writer_args['id_col'] = args.id_col
    elif writer_type is ARFFWriter:
        writer_args['label_col'] = label_col
        writer_args['id_col'] = args.id_col
        writer_args['regression'] = args.arff_regression
        writer_args['relation'] = args.arff_relation
    elif writer_type is LibSVMWriter:
        writer_args['label_map'] = label_map
    writer = writer_type(args.outfile, feature_set, **writer_args)
    writer.write()
Ejemplo n.º 47
0
print()
print("Here we go for some kickass movie script parsing!")
print()
print()
print("Start by telling me when the introduction will end.")

for block in script_text.descendants:
    # If block is an instance of bs4.Tag, it is wrapped in HTML tags.
    # The next block will contain the same text without the tags,
    # so skip this block without parsing it.
    if (isinstance(block, Tag)):
        continue

    # UnicodeDammit converts any string to Unicode
    # (encoding detection does not always work well)
    block = UnicodeDammit(block, soup.original_encoding).unicode_markup
    # remove leading and ending end of lines
    block = block.strip('\n')

    # if the block doesn't have any text, skip it
    if (re.search('\w', block) == None):
        continue

    # bs4 does not always split the different blocks cleanly,
    # so re-split by paragraph and process them one by one
    for line in block.split('\n'):
        stripped_line = line.strip(' \n\t\r')
        if (re.search('\w', line) == None):
            continue

        print(
Ejemplo n.º 48
0
def extract_css(html_input, basename='sample.html', prettify_html=False):
    """Scan `html_input` and replace all styles with single link to a CSS
    file.

    Returns tuple ``<MODIFIED_HTML>, <CSS-CODE>``.

    If the `html_input` contains any ``<style>`` tags, their content
    is aggregated and returned in ``<CSS-CODE>``.

    The tags are all stripped from `html` input and replaced by a link
    to a stylesheet file named ``<basename>.css``. Any extension in
    `basename` is stripped. So ``sample.html`` as `basename` will
    result in a link to ``sample.css``. The same applies for a
    `basename` ``sample.css`` or ``sample``. The modified HTML code is
    returned as first item of the result tuple.

    If `prettify_html` is True, the generated HTML code is prettified
    by BeautifulSoup. This might result in unexpected, visible gaps in
    rendered output.
    """
    # create HTML massage that removes CDATA and HTML comments in styles
    for fix, m in CDATA_MASSAGE:
        html_input = fix.sub(m, html_input)
    soup = BeautifulSoup(html_input, 'html.parser')
    css = '\n'.join([style.text for style in soup.findAll('style')])
    if '<style>' in css:
        css = css.replace('<style>', '\n')

    # lowercase leading tag names
    css = re.sub(RE_CSS_TAG,
                 lambda match: match.group(1).lower() + match.group(2) + '{',
                 css)

    # set indent of all CSS statement lines to nil.
    css = re.sub(RE_CSS_STMT_START, lambda match: '\n' + match.group(1), css)

    # insert spaces after and before curly brackets.
    css = re.sub(RE_CURLY_OPEN, lambda match: '{ ' + match.group(1), css)
    css = re.sub(RE_CURLY_CLOSE, lambda match: match.group(1) + ' }', css)
    css_name = os.path.splitext(basename)[0] + '.css'

    # Remove empty style comments
    css = re.sub(RE_EMPTY_COMMENTS, lambda match: '', css)

    if css.startswith('\n'):
        css = css[1:]

    for num, style in enumerate(soup.findAll('style')):
        if num == 0 and css != '':
            # replace first style with link to stylesheet
            # if there are any styles contained
            new_tag = soup.new_tag('link',
                                   rel='stylesheet',
                                   type='text/css',
                                   href=css_name)
            style.replace_with(new_tag)
        else:
            style.extract()
    if css == '':
        css = None
    if prettify_html:
        return soup.prettify(), css
    return UnicodeDammit(str(soup)).unicode_markup, css
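A minimal, self-contained sketch of the style-to-link replacement described in the docstring (simplified: it skips the CDATA massage and the CSS reflow regexes above, and the input HTML and sample.css name are hypothetical):

from bs4 import BeautifulSoup

html = '<html><head><style>p {color: red}</style></head><body><p>hi</p></body></html>'
soup = BeautifulSoup(html, 'html.parser')
css = '\n'.join(style.text for style in soup.find_all('style'))
for num, style in enumerate(soup.find_all('style')):
    if num == 0 and css:
        # first <style> becomes a link to the extracted stylesheet
        style.replace_with(soup.new_tag('link', rel='stylesheet',
                                        type='text/css', href='sample.css'))
    else:
        style.extract()
print(css)        # p {color: red}
print(str(soup))  # <style> replaced by <link href="sample.css" ...>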
Ejemplo n.º 49
0
def get_file_encoding(file_path):
    with open(file_path, 'rb') as file:
        content = file.read()
    suggestion = UnicodeDammit(content)
    return suggestion.original_encoding
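A quick usage sketch with a hypothetical file written in Latin-1; the exact guess depends on the detector available (chardet/cchardet), so treat the printed value as indicative:

with open('sample-latin1.txt', 'wb') as f:
    f.write('café crème'.encode('latin-1'))

print(get_file_encoding('sample-latin1.txt'))   # e.g. 'ISO-8859-1' or 'windows-1252'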
Ejemplo n.º 50
0
Archivo: irc.py Proyecto: lae/Servrhe
def normalize(s):
    try:
        u = UnicodeDammit.detwingle(s).decode("utf8")
    except:
        u = UnicodeDammit(s, ["utf8", "windows-1252"]).unicode_markup
    return u
Ejemplo n.º 51
0
Archivo: s.py Proyecto: lite/MyTestBox
 def slim_html(self, raw_html):
     doc = UnicodeDammit.detwingle(raw_html)
     soup = BeautifulSoup(doc, "html5lib", from_encoding="utf-8")
     return soup.prettify().encode("utf-8")
Ejemplo n.º 52
0
    def post(self, group_id):
        """Automatically registers several student accounts based on a CSV."""
        group = api.group.get_group(gid=group_id)
        if not group:
            raise PicoException('Classroom not found', 404)

        curr_user = api.user.get_user()
        if (curr_user['tid'] not in (group['teachers'] + [group['owner']])
                and not curr_user['admin']):
            raise PicoException(
                'You do not have permission to batch-register students into ' +
                'this classroom.', status_code=403
            )

        # Load in student demographics from CSV
        req = batch_registration_req.parse_args(strict=True)
        students = []
        unicoded_csv = UnicodeDammit(req['csv'].read())  # Forcibly unicodify
        csv_reader = csv.DictReader(
            unicoded_csv.unicode_markup.split('\n'))
        try:
            for row in csv_reader:
                row = {k: v.strip() for k, v in row.items()}  # Trim whitespace
                students.append(row)
        except csv.Error as e:
            raise PicoException(
                f"Error reading CSV at line {csv_reader.line_num}: {e}",
                status_code=400)

        # Check whether registering these students would exceed maximum
        # batch registrations per teacher account
        config = api.config.get_settings()
        teacher_metadata = api.token.find_key({
            'uid': api.user.get_user()['uid']
        })
        if not teacher_metadata:
            existing_batch_count = 0
        else:
            existing_batch_count = teacher_metadata.get(
                "tokens", {}).get('batch_registered_students', 0)
        potential_batch_count = existing_batch_count + len(students)
        if (potential_batch_count > config['max_batch_registrations']):
            raise PicoException(
                "You have exceeded the maximum number of batch-registered " +
                "student accounts. Please contact an administrator.", 403
            )

        # Validate demographics
        def validate_current_year(s):
            try:
                n = int(s)
                if not (1 <= n <= 12):
                    raise ValueError
            except ValueError:
                raise ValidationError(
                    f'Grade must be between 1 and 12 (provided {s})')

        class BatchRegistrationUserSchema(Schema):
            # Convert empty strings to Nones when doing validation
            # to allow optional parent_email value for age 18+,
            # but back to '' before storing in database.
            @pre_load
            def empty_to_none(self, in_data, **kwargs):
                for k, v in in_data.items():
                    if v == "":
                        in_data[k] = None
                return in_data

            @post_load
            def none_to_empty(self, in_data, **kwargs):
                for k, v in in_data.items():
                    if v is None:
                        in_data[k] = ''
                return in_data
            current_year = fields.Str(
                data_key='Grade (1-12)',
                required=True,
                validate=validate_current_year)
            age = fields.Str(
                data_key='Age (13-17 or 18+)', required=True,
                validate=validate.OneOf(choices=['13-17', '18+']))
            gender = fields.Str(
                data_key="Gender", required=False, allow_none=True,
                validate=validate.OneOf(
                    ['male', 'female', 'nb/gf', 'nl/no'],
                    ['Male', 'Female', 'Non-Binary/Gender-Fluid',
                     'Not listed/Prefer not to answer'],
                    error="If specified, must be one of {labels}. Please use "
                          "the corresponding code from: {choices}."
                )
            )
            parent_email = fields.Email(
                data_key='Parent Email (if under 18)', required=True,
                allow_none=True
            )
            @validates_schema
            def validate_parent_email(self, data, **kwargs):
                if (data['age'] == '13-17' and
                        data['parent_email'] is None):
                    raise ValidationError(
                        'Parent email must be specified for students under 18')

        try:
            students = BatchRegistrationUserSchema().load(
                students, many=True, unknown=RAISE)
        except ValidationError as err:
            raise PicoException(err.messages, status_code=400)

        # Batch-register accounts
        curr_teacher = api.user.get_user()
        created_accounts = api.group.batch_register(
            students, curr_teacher, group_id)

        if len(created_accounts) != len(students):
            raise PicoException(
                "An error occurred while adding student accounts. " +
                f"The first {len(created_accounts)} were created. " +
                "Please contact an administrator."
            )

        output = []
        for i in range(len(students)):
            output.append({
                'Grade (1-12)': students[i]['current_year'],
                'Age (13-17 or 18+)': students[i]['age'],
                'Gender': students[i]['gender'],
                'Parent Email (if under 18)': students[i]['parent_email'],
                'Username': created_accounts[i]['username'],
                'Password': created_accounts[i]['password']
            })

        buffer = io.StringIO()
        csv_writer = csv.DictWriter(buffer, [
            'Grade (1-12)',
            'Age (13-17 or 18+)',
            'Gender',
            'Parent Email (if under 18)',
            'Username',
            'Password'
        ])
        csv_writer.writeheader()
        csv_writer.writerows(output)
        output_csv_bytes = buffer.getvalue().encode('utf-8')

        return jsonify({
            'success': True,
            'accounts': created_accounts,
            'as_csv': base64.b64encode(output_csv_bytes).decode('utf-8')
        })
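For orientation, the header names this endpoint expects come from the schema's data_key values; a made-up input CSV might look like the following sketch:

# Illustrative only: two hypothetical rows matching the schema's data_key columns
sample_csv = (
    "Grade (1-12),Age (13-17 or 18+),Gender,Parent Email (if under 18)\n"
    "9,13-17,female,parent@example.com\n"
    "12,18+,,\n"
)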
Ejemplo n.º 53
0
    def _import(self, message):
        """import <url> [<alias(es)>] - imports all aliases from the given address, or only the listed aliases"""
        if len(message.ParameterList) < 2:
            return IRCResponse(ResponseType.Say,
                               u"You didn't give a url to import from!",
                               message.ReplyTo)

        if len(message.ParameterList) > 2:
            onlyListed = True
            importList = [alias.lower() for alias in message.ParameterList[2:]]
        else:
            onlyListed = False

        url = message.ParameterList[1]
        try:
            page = self.bot.moduleHandler.runActionUntilValue('fetch-url', url)
        except ValueError:
            return IRCResponse(ResponseType.Say,
                               u"'{}' is not a valid URL".format(url),
                               message.ReplyTo)
        if page is None:
            return IRCResponse(ResponseType.Say,
                               u"Failed to open page at {}".format(url),
                               message.ReplyTo)

        text = page.body
        text = UnicodeDammit(text).unicode_markup
        lines = text.splitlines()
        numAliases = 0
        numHelpTexts = 0
        for lineNumber, line in enumerate(lines):
            # Skip over blank lines
            if line == u"":
                continue
            splitLine = line.split()
            if splitLine[0].lower() != u"{}alias".format(self.bot.commandChar):
                return IRCResponse(ResponseType.Say,
                                   u"Line {} at {} does not begin with {}alias".format(lineNumber,
                                                                                       url,
                                                                                       self.bot.commandChar),
                                   message.ReplyTo)
            subCommand = splitLine[1].lower()
            if subCommand not in [u"add", u"help"]:
                return IRCResponse(ResponseType.Say,
                                   u"Line {} at {} is not an add or help command".format(lineNumber, url),
                                   message.ReplyTo)

            aliasName = splitLine[2].lower()
            aliasCommand = splitLine[3:]
            aliasCommand[0] = aliasCommand[0].lower()

            # Skip over aliases that weren't listed, if any were listed
            if onlyListed and aliasName not in importList:
                continue

            if subCommand == u"add":
                self._newAlias(aliasName, u" ".join(aliasCommand))
                numAliases += 1
            elif subCommand == u"help":
                aliasHelp = u" ".join(splitLine[3:])
                self._setAliasHelp(aliasName, aliasHelp)
                numHelpTexts += 1

        self._syncAliases()

        return IRCResponse(ResponseType.Say,
                           u"Imported {} alias(es) and {} help string(s) from {}".format(numAliases,
                                                                                         numHelpTexts,
                                                                                         url),
                           message.ReplyTo)
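Based on the parsing above, the fetched page must contain one alias command per line; a hypothetical import page (assuming the bot's commandChar is ".") would read:

.alias add greet say Hello, everyone!
.alias help greet Greets the current channel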
Ejemplo n.º 54
0
import requests
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import csv

url = "http://www.billboard.com/charts/hot-100"
r = requests.get(url)
page_cont = r.content
print(page_cont)
suggestion = UnicodeDammit(page_cont)
print(suggestion.original_encoding)  # e.g. 'utf-8', whatever encoding UnicodeDammit detected
page_cont_par = BeautifulSoup(r.content, "html.parser")
containers = page_cont_par.findAll("div", {"class": "chart-row__main-display"})
filename = "musics.csv"
f = open(filename, "w", encoding="utf-8")
headers = "posicao;musica;artista\n"
f.write(headers)
container = containers[0]
for container in containers:
    posicao = container.div.span.text.strip()
    musica = container.h2.text.strip()
    artista = container.a.text.strip()

    print("posicao: " + posicao)
    print("musica: " + musica)
    print("artista: " + artista)
    f.write(posicao + ";" + musica.replace(";", "|") + ";" +
            artista.replace(";", "|") + "\n")

f.close()
Ejemplo n.º 55
0
Archivo: port.py Proyecto: molbal/Pyfa
    def importFitFromFiles(paths, iportuser=None):
        """
        Imports fits from file(s). First processes all provided paths and stores
        assembled fits into a list. This allows us to call back to the GUI as
        fits are processed as well as when fits are being saved.
        Returns a (success, result) tuple: (True, fit_list) on success,
        (False, error message) on failure.
        """

        sFit = svcFit.getInstance()

        fit_list = []
        try:
            for path in paths:
                if iportuser:  # Pulse
                    msg = "Processing file:\n%s" % path
                    pyfalog.debug(msg)
                    processing_notify(iportuser, IPortUser.PROCESS_IMPORT | IPortUser.ID_UPDATE, msg)
                    # wx.CallAfter(callback, 1, msg)

                with open(path, "rb") as file_:
                    srcString = file_.read()
                    dammit = UnicodeDammit(srcString)
                    srcString = dammit.unicode_markup

                if len(srcString) == 0:  # ignore blank files
                    pyfalog.debug("File is blank.")
                    continue

                try:
                    importType, makesNewFits, fitsImport = Port.importAuto(srcString, path, iportuser=iportuser)
                    fit_list += fitsImport
                except xml.parsers.expat.ExpatError:
                    pyfalog.warning("Malformed XML in:\n{0}", path)
                    return False, "Malformed XML in %s" % path

            # IDs = []  # NOTE: what use for IDs?
            numFits = len(fit_list)
            for idx, fit in enumerate(fit_list):
                # Set some more fit attributes and save
                fit.character = sFit.character
                fit.damagePattern = sFit.pattern
                fit.targetProfile = sFit.targetProfile
                if len(fit.implants) > 0:
                    fit.implantLocation = ImplantLocation.FIT
                else:
                    useCharImplants = sFit.serviceFittingOptions["useCharacterImplantsByDefault"]
                    fit.implantLocation = ImplantLocation.CHARACTER if useCharImplants else ImplantLocation.FIT
                db.save(fit)
                # IDs.append(fit.ID)
                if iportuser:  # Pulse
                    pyfalog.debug("Processing complete, saving fits to database: {0}/{1}", idx + 1, numFits)
                    processing_notify(
                        iportuser, IPortUser.PROCESS_IMPORT | IPortUser.ID_UPDATE,
                        "Processing complete, saving fits to database\n(%d/%d) %s" % (idx + 1, numFits, fit.ship.name)
                    )

        except UserCancelException:
            return False, "Processing has been canceled.\n"
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            pyfalog.critical("Unknown exception processing: {0}", path)
            pyfalog.critical(e)
            # TypeError: not all arguments converted during string formatting
#                 return False, "Unknown Error while processing {0}" % path
            return False, "Unknown error while processing {}\n\n Error: {} {}".format(
                path, type(e).__name__, getattr(e, 'message', ''))

        return True, fit_list
Ejemplo n.º 56
0
def store_subtitles(file):
    logging.debug('BAZARR started subtitles indexing for this file: ' + file)
    actual_subtitles = []
    if os.path.exists(file):
        # notifications.write(msg='Analyzing this file for subtitles: ' + file, queue='list_subtitles')
        if settings.general.getboolean('use_embedded_subs'):
            logging.debug("BAZARR is trying to index embedded subtitles.")
            try:
                subtitle_languages = embedded_subs_reader.list_languages(file)
                for subtitle_language in subtitle_languages:
                    try:
                        if alpha2_from_alpha3(subtitle_language) is not None:
                            lang = str(alpha2_from_alpha3(subtitle_language))
                            logging.debug(
                                "BAZARR embedded subtitles detected: " + lang)
                            actual_subtitles.append([lang, None])
                    except:
                        logging.debug(
                            "BAZARR unable to index this unrecognized language: "
                            + subtitle_language)
                        pass
            except Exception as e:
                logging.exception(
                    "BAZARR error when trying to analyze this %s file: %s" %
                    (os.path.splitext(file)[1], file))
                pass

        brazilian_portuguese = [".pt-br", ".pob", "pb"]
        try:
            dest_folder = get_subtitle_destination_folder()
            subliminal_patch.core.CUSTOM_PATHS = [dest_folder
                                                  ] if dest_folder else []
            subtitles = search_external_subtitles(
                file,
                languages=get_language_set(),
                only_one=settings.general.getboolean('single_language'))
        except Exception as e:
            logging.exception("BAZARR unable to index external subtitles.")
            pass
        else:
            for subtitle, language in subtitles.iteritems():
                subtitle_path = get_external_subtitles_path(file, subtitle)
                if str(os.path.splitext(subtitle)[0]).lower().endswith(
                        tuple(brazilian_portuguese)):
                    logging.debug("BAZARR external subtitles detected: " +
                                  "pb")
                    actual_subtitles.append(
                        [str("pb"),
                         path_replace_reverse(subtitle_path)])
                elif str(language) != 'und':
                    logging.debug("BAZARR external subtitles detected: " +
                                  str(language))
                    actual_subtitles.append(
                        [str(language),
                         path_replace_reverse(subtitle_path)])
                else:
                    if os.path.splitext(subtitle)[1] != ".sub":
                        logging.debug(
                            "BAZARR falling back to file content analysis to detect language."
                        )
                        with open(
                                path_replace(
                                    os.path.join(os.path.dirname(file),
                                                 subtitle)), 'r') as f:
                            text = list(islice(f, 100))
                            text = ' '.join(text)
                            encoding = UnicodeDammit(text)
                            try:
                                text = text.decode(encoding.original_encoding)
                                detected_language = langdetect.detect(text)
                            except Exception as e:
                                logging.exception(
                                    'BAZARR Error trying to detect language for this subtitles file: '
                                    + path_replace(
                                        os.path.join(os.path.dirname(file),
                                                     subtitle)) +
                                    ' You should try to delete this subtitles file manually and ask Bazarr to download it again.'
                                )
                            else:
                                if len(detected_language) > 0:
                                    logging.debug(
                                        "BAZARR external subtitles detected and analysis guessed this language: "
                                        + str(detected_language))
                                    actual_subtitles.append([
                                        str(detected_language),
                                        path_replace_reverse(
                                            os.path.join(
                                                os.path.dirname(file),
                                                subtitle))
                                    ])

        conn_db = sqlite3.connect(os.path.join(args.config_dir, 'db',
                                               'bazarr.db'),
                                  timeout=30)
        c_db = conn_db.cursor()
        logging.debug("BAZARR storing those languages to DB: " +
                      str(actual_subtitles))
        c_db.execute("UPDATE table_episodes SET subtitles = ? WHERE path = ?",
                     (str(actual_subtitles), path_replace_reverse(file)))
        conn_db.commit()

        c_db.close()
    else:
        logging.debug(
            "BAZARR this file doesn't seems to exist or isn't accessible.")

    logging.debug('BAZARR ended subtitles indexing for this file: ' + file)

    return actual_subtitles
Ejemplo n.º 57
0
def soup_in(filename):
    # detwingle() expects raw bytes, so read the file in binary mode
    return BeautifulSoup(
        UnicodeDammit.detwingle(open(filename, 'rb').read()).decode('utf8'))
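A hypothetical call of the helper above (the path is a placeholder):

soup = soup_in('page.html')
print(soup.title)  # None if the document has no <title>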
Ejemplo n.º 58
0
## MAIN FILE

my_path = '/Users/lekha/galvanize/capstone/prelims/huskies/data/2015-05-26-Washington/'
all_files = [f for f in os.listdir(my_path) if os.path.isfile(os.path.join(my_path, f))]
data = {}

#files = ['00006.html', '05111108.html', '120394.html', '1bettyevans.html']
#files = ['05111108.html']


files = all_files[1000:]

for html_file in files:
    with open(os.path.join(my_path, html_file), 'rb') as f:
        # read the raw bytes so detwingle() can repair any mixed encodings
        s = f.read()
        new_s = UnicodeDammit.detwingle(s)
        new_s = new_s.decode("utf-8")
        soup = BeautifulSoup(new_s, 'html.parser')
        summary = extractSummary(soup)
        names = extractName(soup)

        opath = '/Users/lekha/galvanize/capstone/prelims/huskies/data/2015-05-26-Washington/'
        ofile = os.path.join(opath, "output0.txt")
        #printSummaryRows(summary, opath)
#        printPhotoRows(photos, opath)
#        printSkillRows(skills, opath)

        # soup = BeautifulSoup(s, 'html.parser')
        # full_name = soup.find('span', {'class': 'full-name'})
        # summary = soup.find('div', {'class':'summary'})
        # if full_name:
Ejemplo n.º 59
0
def clean_unicode(comment_str):
    comment_str = comment_str.replace('\n', '').replace('\r', '').strip()
    comment_str = ' '.join(comment_str.split())
    return UnicodeDammit(comment_str).unicode_markup
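A quick sketch of the cleaning behaviour, with a made-up comment string:

# Hypothetical input: newlines are removed and runs of whitespace collapsed
print(clean_unicode("  Great   product!\r\n Résumé attached.  "))
# -> 'Great product! Résumé attached.'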
Ejemplo n.º 60
0
def store_subtitles(file):
    logging.debug('BAZARR started subtitles indexing for this file: ' + file)
    actual_subtitles = []
    if os.path.exists(file):
        if os.path.splitext(file)[1] == '.mkv':
            logging.debug("BAZARR is trying to index embedded subtitles.")
            try:
                with open(file, 'rb') as f:
                    mkv = enzyme.MKV(f)

                for subtitle_track in mkv.subtitle_tracks:
                    try:
                        if alpha2_from_alpha3(subtitle_track.language) != None:
                            lang = str(
                                alpha2_from_alpha3(subtitle_track.language))
                            logging.debug(
                                "BAZARR embedded subtitles detected: " + lang)
                            actual_subtitles.append([lang, None])
                    except:
                        logging.debug(
                            "BAZARR unable to index this unrecognized language: "
                            + subtitle_track.language)
                        pass
            except Exception as e:
                logging.exception(
                    "BAZARR error when trying to analyze this mkv file: " +
                    file)
                pass
        else:
            logging.debug("BAZARR This file isn't an .mkv file.")

        brazilian_portuguese = [".pt-br", ".pob", "pb"]
        try:
            subtitles = core.search_external_subtitles(file)
        except Exception as e:
            logging.exception("BAZARR unable to index external subtitles.")
            pass
        else:
            for subtitle, language in subtitles.iteritems():
                if str(os.path.splitext(subtitle)[0]).lower().endswith(
                        tuple(brazilian_portuguese)) is True:
                    logging.debug("BAZARR external subtitles detected: " +
                                  "pb")
                    actual_subtitles.append([
                        str("pb"),
                        path_replace_reverse(
                            os.path.join(os.path.dirname(file), subtitle))
                    ])
                elif str(language) != 'und':
                    logging.debug("BAZARR external subtitles detected: " +
                                  str(language))
                    actual_subtitles.append([
                        str(language),
                        path_replace_reverse(
                            os.path.join(os.path.dirname(file), subtitle))
                    ])
                else:
                    if os.path.splitext(subtitle)[1] != ".sub":
                        logging.debug(
                            "BAZARR falling back to file content analysis to detect language."
                        )
                        with open(
                                path_replace(
                                    os.path.join(os.path.dirname(file),
                                                 subtitle)), 'r') as f:
                            text = list(islice(f, 100))
                            text = ' '.join(text)
                            encoding = UnicodeDammit(text)
                            try:
                                text = text.decode(encoding.original_encoding)
                                detected_language = langdetect.detect(text)
                            except Exception as e:
                                logging.exception(
                                    'BAZARR Error trying to detect language for this subtitles file: '
                                    + path_replace(
                                        os.path.join(os.path.dirname(file),
                                                     subtitle)) +
                                    ' You should try to delete this subtitles file manually and ask Bazarr to download it again.'
                                )
                            else:
                                if len(detected_language) > 0:
                                    logging.debug(
                                        "BAZARR external subtitles detected and analysis guessed this language: "
                                        + str(detected_language))
                                    actual_subtitles.append([
                                        str(detected_language),
                                        path_replace_reverse(
                                            os.path.join(
                                                os.path.dirname(file),
                                                subtitle))
                                    ])

        conn_db = sqlite3.connect(os.path.join(config_dir, 'db/bazarr.db'),
                                  timeout=30)
        c_db = conn_db.cursor()
        logging.debug("BAZARR storing those languages to DB: " +
                      str(actual_subtitles))
        c_db.execute("UPDATE table_episodes SET subtitles = ? WHERE path = ?",
                     (str(actual_subtitles), path_replace_reverse(file)))
        conn_db.commit()

        c_db.close()
    else:
        logging.debug(
            "BAZARR this file doesn't seems to exist or isn't accessible.")

    logging.debug('BAZARR ended subtitles indexing for this file: ' + file)

    return actual_subtitles