Example #1
def submit(day: int, part: int) -> None:
    if AOC_SESSION_COOKIE is None:
        raise ValueError('Missing AOC_SESSION_COOKIE!')

    part_word = 'one' if part == 1 else 'two'

    solution_module = importlib.import_module(f'{YEAR}.{day:02}.solution')
    answer_func = getattr(solution_module, f'part_{part_word}')
    problem_input = getattr(solution_module, 'parse_data')()

    answer = answer_func(problem_input)

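    # Submit via the answer endpoint: the part number goes in the 'level' form field and
    # the computed result in 'answer', authenticated with the session cookie.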
    resp = requests.post(f'https://adventofcode.com/{YEAR}/day/{day}/answer',
                         cookies={'session': AOC_SESSION_COOKIE},
                         data={
                             'level': part,
                             'answer': answer
                         })

    if not resp.ok:
        raise ValueError(f'Bad response from site: {resp.status_code}')

    msg = BeautifulSoup(resp.text, 'html.parser').article.text

    if msg.startswith("That's the") and part == 1:
        webbrowser.open(resp.url)

    print(f'Day {day:02} Part {part:02}: {msg}')
Example #2
def findvideos(item):
    logger.info()

    itemlist = list()

    soup = create_soup(item.url)
    matches = soup.find("ul", id="playeroptionsul")

    for elem in matches.find_all("li"):
        if "youtube" in elem.find("span", class_="server").text:
            continue
        post = {
            "action": "doo_player_ajax",
            "post": elem["data-post"],
            "nume": elem["data-nume"],
            "type": elem["data-type"]
        }
        headers = {"Referer": item.url}
        doo_url = "%swp-admin/admin-ajax.php" % host
        data = httptools.downloadpage(doo_url, post=post, headers=headers).json
        try:
            url = BeautifulSoup(data.get("embed_url", ""),
                                "html5lib").find("iframe")["src"]
        except:
            continue

        if not url.startswith("http"):
            url = "https:%s" % url
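        # The '%s' title is a placeholder; get_servers_itemlist() below fills it with the server name.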
        itemlist.append(
            Item(channel=item.channel,
                 title="%s",
                 action="play",
                 url=url,
                 language="LAT",
                 infoLabels=item.infoLabels))

    itemlist = servertools.get_servers_itemlist(
        itemlist, lambda x: x.title % x.server.capitalize())

    # Required by FilterTools

    itemlist = filtertools.get_links(itemlist, item, list_language)

    # Required by AutoPlay

    autoplay.start(itemlist, item)
    if item.contentType != "episode":
        if config.get_videolibrary_support(
        ) and len(itemlist) > 0 and item.extra != "findvideos":
            itemlist.append(
                Item(
                    channel=item.channel,
                    title=
                    "[COLOR yellow]Añadir esta pelicula a la videoteca[/COLOR]",
                    url=item.url,
                    action="add_pelicula_to_library",
                    extra="findvideos",
                    contentTitle=item.contentTitle))

    return itemlist
Example #3
 def crude_parsing(self):
     crude_list = []
     standard_fields = [
         "from:", "to:", "cc:", "bcc:", "mime-version:", "content-type:",
         "x-from:", "x-to:", "x-cc:", "content-transfer-encoding:",
         "x-bcc:", "x-filename", "subject:", "message-id:", "x-origin:"
     ]
     with open(self.origin_file, encoding="utf-8", errors="ignore") as f:
         for line in f:
             try:
                 line = BeautifulSoup(line, "html.parser").getText()
             except Exception:
                 line = ""
             line = line.lower()
             if line in ['\n', '\r\n']:
                 crude_list.append("content: " + line.strip())
             else:
                 content = False
                 for field in standard_fields:
                     if line.startswith(field):
                         content = True
                         crude_list.append(line.strip())
                 if not content:
                     if len(crude_list) > 0:
                         crude_list[len(crude_list) -
                                    1] += " " + line.strip()
                     else:
                         crude_list.append("content: " + line.strip())
     return crude_list
Example #4
def check_reference(dsl_lookuper, word, article):
    # Special case for articles in En-En-Longman_DOCE5.dsl
    text = BeautifulSoup(article, 'html.parser').text
    if text.startswith(STR_SEE_MAIN_ENTRY):
        referenced_word = text[len(STR_SEE_MAIN_ENTRY):].strip()
        logging.info('Detected reference from "%s" to "%s" (LongmanDOCE5)',
                     word, referenced_word)
        return lookup_word(dsl_lookuper, referenced_word)

    # Special case for CambridgeAdvancedLearners
    main_entry_start = article.find(STR_MAIN_ENTRY)
    if main_entry_start != -1:
        article_rest = article[main_entry_start + len(STR_MAIN_ENTRY):]
        match = RE_A_HREF.search(article_rest)
        if match:
            referenced_word = match.group(1)
            if referenced_word != word:
                logging.info(
                    'Detected reference from "%s" to "%s" (CambridgeAdvancedLearners)',
                    word, referenced_word)
                more_article, more_examples = lookup_word(
                    dsl_lookuper, referenced_word)
                return article + more_article, more_examples

    # Special case for LingvoUniversal
    if len(text) < SHORT_ARTICLE_LENGTH:
        match = RE_SHORT_REFERENCE.search(text)
        if match:
            referenced_word = match.group(1)
            if word == referenced_word:
                logging.warning(
                    'Self reference from "%s" to "%s", skipping (LingvoUniversal)',
                    word, referenced_word)
            else:
                logging.info(
                    'Detected reference from "%s" to "%s" (LingvoUniversal)',
                    word, referenced_word)
                return lookup_word(dsl_lookuper, referenced_word)

    # Special case for En-En_American_Heritage_Dictionary.dsl
    match = RE_SEE_OTHER.search(text)
    if match:
        referenced_word = match.group(1)
        if referenced_word != word:
            logging.info(
                'Detected reference from "%s" to "%s" (AmericanHeritageDictionary)',
                word, referenced_word)
            return lookup_word(dsl_lookuper, referenced_word)

    return article, None
Example #5
    def downloadJacket(self, filename):
        """Downloads the book's jacket image.

        Args:
            filename (str): The name of the file to save the image to.
        """
        with open('{}.png'.format(filename), 'wb') as jacket:
            image_url = BeautifulSoup(
                requests.get(
                    'https://sfpl.bibliocommons.com/item/show/{}'.format(
                        self._id)).text,
                'lxml').find(class_='jacketCover bib_detail')['src']
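            # The jacket URL may be protocol-relative ('//...'), so prefix it with 'https:' before fetching.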
            jacket.write(
                requests.get(image_url if image_url.startswith('http') else
                             'https:{}'.format(image_url)).content)
Example #6
def build_header_body(h, b, lineList):
    """
    Merges referring lines into the header.
    """
    if len(lineList) > 0:
        firstLineText = BeautifulSoup(lineList[0]).get_text().strip()

        if firstLineText.startswith(("siehe", "mit siehe", "vgl.")):
            h += lineList[0]
            lineList = lineList[1:]
            h, b = build_header_body(h, b, lineList)

        else:
            b = "<br>".join(lineList)
    return (h, b)
Example #7
def build_header_body(h, b, lineList):
    '''
    Merges referring lines into the header.
    '''
    if len(lineList) > 0:
        firstLineText=BeautifulSoup(lineList[0]).get_text().strip()
        
        if firstLineText.startswith(('siehe', 'mit siehe', 'vgl.')):
            h += lineList[0]
            lineList = lineList[1:]
            h, b = build_header_body(h, b, lineList)
         
        else:
            b = '<br>'.join(lineList)
    return (h, b)
Example #8
def check_reference(dsl_lookuper, word, article):
    # Special case for articles in En-En-Longman_DOCE5.dsl
    text = BeautifulSoup(article, 'html.parser').text
    if text.startswith(STR_SEE_MAIN_ENTRY):
        referenced_word = text[len(STR_SEE_MAIN_ENTRY):].strip()
        logging.info('Detected reference from "%s" to "%s" (LongmanDOCE5)', word, referenced_word)
        return lookup_word(dsl_lookuper, referenced_word)

    # Special case for CambridgeAdvancedLearners
    main_entry_start = article.find(STR_MAIN_ENTRY)
    if main_entry_start != -1:
        article_rest = article[main_entry_start + len(STR_MAIN_ENTRY):]
        match = RE_A_HREF.search(article_rest)
        if match:
            referenced_word = match.group(1)
            if referenced_word != word:
                logging.info('Detected reference from "%s" to "%s" (CambridgeAdvancedLearners)', word, referenced_word)
                more_article, more_examples = lookup_word(dsl_lookuper, referenced_word)
                return article + more_article, more_examples

    # Special case for LingvoUniversal
    if len(text) < SHORT_ARTICLE_LENGTH:
        match = RE_SHORT_REFERENCE.search(text)
        if match:
            referenced_word = match.group(1)
            if word == referenced_word:
                logging.warning('Self reference from "%s" to "%s", skipping (LingvoUniversal)', word, referenced_word)
            else:
                logging.info('Detected reference from "%s" to "%s" (LingvoUniversal)', word, referenced_word)
                return lookup_word(dsl_lookuper, referenced_word)

    # Special case for En-En_American_Heritage_Dictionary.dsl
    match = RE_SEE_OTHER.search(text)
    if match:
        referenced_word = match.group(1)
        if referenced_word != word:
            logging.info('Detected reference from "%s" to "%s" (AmericanHeritageDictionary)', word, referenced_word)
            return lookup_word(dsl_lookuper, referenced_word)

    return article, None
Example #9
    def latex_with_markup(self, string):
        string = BeautifulSoup(string, "lxml").text
        string = string.replace("\\", "\\\\")

        string = re.sub(r"\{(.*?)\}", r"≾\1≿", string)

        string = regex.sub(
            r"(([\p{IsLatin}\d\:\,\–\-\.]+\s*)+)", r"\\textenglish{\1}", string
        )
        string = string.replace(" }", "} ")
        string = re.sub(r"\[(.*?)\]", r"\\footnote{\1}", string)
        string = re.sub(r"≾(.*?)≿", r"\\footnote{i.e. \1}", string)
        # string = re.sub('\{(.*?)\}', r'\\footnote{i.e. \1}', string);
        # string = re.sub('\{(.*?)\}', r'\\footnote{i.e. \1}', string)

        # string = string.replace('؟','')
        string = self.latex_punctuation(string)

        if string.startswith("\\footnote"):
            string = "\\textenglish{[See footnote.]}" + string

        return string
Example #10
    def save_Q_A():
        '''Reformat the questions and answers from the humor_Q_A file and write them to a local file.'''
        qa_list = []
        with codecs.open('./data/humor_Q_A_old_version2.txt', encoding='utf-8') as f:
            for line in f.readlines():
                json_line = json.loads(line)
                Q = BeautifulSoup(json_line['Q']).text.strip()
                A_str = BeautifulSoup(json_line['A']).text.strip()
                if A_str.startswith('&lt;'):
                    A_str = A_str.replace('&lt;', '<').replace('&gt;', '>')
                A = 'A:' + '<br>A:'.join(re.split('\n\n+', A_str))
                # print Q
                # print A
                Q_A_str = '<table><tr><td><font color="#4EABF9"><u>%s</u></font><br>%s</td></tr></table>\n' % (Q, A)
                qa_list.append(Q_A_str)

        # Send to email
        # mail_to = "*****@*****.**"
        # send_to_163_mail(''.join(qa_list), mail_to)

        # Write to a local file
        codecs.open('Q_A.txt', mode='wb', encoding='utf-8').writelines(qa_list)
Example #11
    def save_Q_A():
        '''Reformat the questions and answers from the humor_Q_A file and write them to a local file.'''
        qa_list = []
        with codecs.open('./data/humor_Q_A_old_version2.txt',
                         encoding='utf-8') as f:
            for line in f.readlines():
                json_line = json.loads(line)
                Q = BeautifulSoup(json_line['Q']).text.strip()
                A_str = BeautifulSoup(json_line['A']).text.strip()
                if A_str.startswith('&lt;'):
                    A_str = A_str.replace('&lt;', '<').replace('&gt;', '>')
                A = 'A:' + '<br>A:'.join(re.split('\n\n+', A_str))
                # print Q
                # print A
                Q_A_str = '<table><tr><td><font color="#4EABF9"><u>%s</u></font><br>%s</td></tr></table>\n' % (
                    Q, A)
                qa_list.append(Q_A_str)

        # Send to email
        # mail_to = "*****@*****.**"
        # send_to_163_mail(''.join(qa_list), mail_to)

        # Write to a local file
        codecs.open('Q_A.txt', mode='wb', encoding='utf-8').writelines(qa_list)
Example #12
def play(item):
    logger.info()

    itemlist = list()

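    # Ask the site's admin-ajax endpoint for the player markup, then pull the iframe's src out of it.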
    doo_url = "%swp-admin/admin-ajax.php" % host
    data = httptools.downloadpage(doo_url,
                                  post=item.post,
                                  headers={
                                      "referer": item.ref
                                  }).data
    try:
        url = BeautifulSoup(data, "html5lib").find("iframe")["src"]
    except:
        return

    if not url.startswith("http"):
        url = "https:%s" % url

    itemlist.append(item.clone(url=url, server=''))

    itemlist = servertools.get_servers_itemlist(itemlist)

    return itemlist
Example #13
def sectionalize_8k(db, url, file_name):
    global total_files_processe
    global total_files_passed
    global total_files_failed
    global total_files_no_data
    global total_exibit_passed
    global total_exibit_failed
    global total_exibit_no_data

    #Opening file and reading the content of the file
    fp = open(url, "r", encoding="utf-8")
    webContent = fp.readlines()

    ind = 0

    #items_8K list stores all information about 8-K items
    item_no_list = []  # list of item number
    title_list = []  # list of title
    item_text_list = []  # list of item text

    exhibit_no_list = []  # stores all exhibit numbers of the particular form
    exhibit_text_list = []  # description of the corresponding exhibit numbers
    is_exhibit = False
    try:
        while (True):
            #parsing html text from particular index of the html file
            html_line = BeautifulSoup(webContent[ind],
                                      "html.parser").text.strip().lower()

            # Exit condition: if the text starts with "signature", leave the loop, since no information
            # beyond the signature block is needed.
            if html_line.startswith("signature"):
                break

            #incrementing index to fetch the line of that index from html file in next iteration
            ind += 1

            #below block of statements retrieve the text which is starting from "item" keyword
            if html_line != "" and html_line.startswith("item"):

                #item_list list stores each item information and their exhibit_no list
                item_list = []

                temp_ind = ind - 1
                item_no = ""  #stores item number
                title = ""  #stores title of the item number
                item_text = ""  #stores text under the item number

                temp = BeautifulSoup(webContent[temp_ind],
                                     "html.parser").text.strip()
                ''' The code below checks for the item number and its title. The first 'if' checks whether the
                    html line matches an item-number-only pattern and, if it matches, looks for the title in the
                    following lines. The 'else' branch checks whether the html line has the item number and its
                    title together in one pattern.
                '''

                #Checks only Item No first, then its title in next html lines
                if re.fullmatch(r'itemÂ*\s\d+\.\d+\.*', temp.lower()):
                    item_no = temp.lower()

                    #Finding the title once item number found
                    for i in range(temp_ind + 1, temp_ind + 10):
                        if (len(webContent) > i):
                            title_html = BeautifulSoup(
                                webContent[i], "html.parser").text.strip()
                            if title_html != "":
                                title = title_html
                                temp_ind += 1
                else:
                    # Checks for Item No with Title in single html line
                    if re.match(r'itemÂ*\s\d+\.\d+\s*\w+\.*', temp.lower()):
                        temp_split = temp.split()

                        #Finding the Title once item number found
                        for t in temp_split:
                            if re.fullmatch(r'\d+\.\d+\.*', t):
                                item_no = "item " + t
                                t_ind = temp_split.index(t)
                                for i in temp_split[t_ind + 1:]:
                                    title += i + " "

                        temp_ind += 1

                item_no = re.sub('[^a-z0-9.,?!%()$]+', ' ', item_no)
                title = re.sub('[^A-Za-z0-9.,?!%()$]+', ' ', title)
                # print("item no:" + item_no)
                # print("title:" + title)

                # Below code finds the text below the particular item number
                for text in webContent[temp_ind + 1:]:
                    temp1 = BeautifulSoup(text, "html.parser").text.strip()

                    #Exit criteria: If next item found or signature text found, it exits from the loop
                    if (temp1.lower().startswith("item")
                            or temp1.lower().startswith("signature")):
                        break

                    if temp1 == title:
                        continue

                    item_text += temp1

                    #print(temp1)
                    #Checks for the exhibit number if item no is '9.01'.
                    if re.fullmatch(r'item 9.01\.*', item_no) and re.fullmatch(
                            r'\d+\.\d+\.*', temp1):
                        exhibit_no_list.append(temp1)
                    else:
                        #If text contains exhibit number and its description in one line, then split the text and find exhibit number
                        if re.fullmatch(r'item 9.01\.*', item_no) and re.match(
                                r'\d+\.\d+\.*\s*\w+\.*', temp1):
                            temp_split = temp1.split()
                            # Finding the exhibit number
                            for t in temp_split:
                                if re.fullmatch(r'\d+\.\d+\.*', t):
                                    exhibit_no_list.append(t)

                #print(exhibit_no_list)
                item_text = re.sub('[^A-Za-z0-9.,?!%()$]+', ' ', item_text)
                #print("item text:"+ item_text)

                is_exhibit = False
                #Extracting exhibit descriptipn of corresponding exhibit number
                if re.fullmatch(r'item 9.01\.*', item_no) and exhibit_no_list:
                    #Looping through all exhibit numbers
                    for i in range(0, len(exhibit_no_list)):
                        '''
                        Storing length of ith exhibit number and first index of the ith exhibit number to retrieve the
                        description of corresponding exhibit number. The description is captured next index of the exhibit
                        number till the next exhibit number is found. If there is a last element in exhibit number list,
                        then retrieve till the end
                        '''
                        item_length = len(str(exhibit_no_list[i]))
                        current_index = item_text.index(str(
                            exhibit_no_list[i]))
                        if i == len(exhibit_no_list) - 1:
                            exhibit_text_list.append(item_text[current_index +
                                                               item_length:])
                        else:
                            next_index = item_text.index(
                                str(exhibit_no_list[i + 1]))
                            exhibit_text_list.append(
                                item_text[current_index +
                                          item_length:next_index])

                        is_exhibit = True

                #Storing item_no, title and text in item_list and exhibit number in exhibit_list.
                #Then store all information in items_8K list.
                if item_no and title and item_text:
                    item_no_list.append(item_no)
                    title_list.append(title)
                    item_text_list.append(item_text)

        #Storing 8-k Form details into MongoDB
        if item_no_list and title_list and item_text_list:
            # If any exhibit numbers were found, count this form as having exhibit data; otherwise count it as having none.
            if exhibit_no_list:
                total_exibit_passed += 1
            else:
                total_exibit_no_data += 1

            insertData(db, file_name, item_no_list, title_list, item_text_list,
                       exhibit_no_list, exhibit_text_list)
            total_files_passed += 1
        else:
            total_files_no_data += 1
        log_print_statments("Processed in " + str(url))

    except Exception as ex:
        total_files_failed += 1
        if is_exhibit is False:
            total_exibit_failed += 1
        log_print_statments("Exception in " + str(url))
Example #14
 def parse(self, response):
     """
     Scrapes the list of modules associated with Bottle. Causes
     scrapy to follow the links to the module docs and uses a different
     parser to extract the API information contained therein.
     """
     # Find all the function definitions on the page:
     for func in response.css('dl.function'):
         # Function details are always the first items in the dl.
         func_spec = func.css('dt')[0]
         func_doc = func.css('dd')[0]
         # Function name is always first dt
         func_name = BeautifulSoup(func_spec.css('code.descname').\
             extract()[0], 'html.parser').text
         # Args into function
         args = []
         for ems in func_spec.css('em'):
             args.append(ems.extract().replace('<em>', '').\
                 replace('</em>', ''))
         # Function description.
         soup = BeautifulSoup(func_doc.extract(), 'html.parser')
         d = self.to_dict(func_name, args, soup.text)
         if d:
             yield d
     # Find all the class definitions on the page:
     for classes in response.css('dl.class'):
         # Class details are always first items in dl.
         class_spec = classes.css('dt')[0]
         class_doc = classes.css('dd')[0]
         # Class name is always first dt
         class_name = BeautifulSoup(class_spec.css('code.descname').\
             extract()[0], 'html.parser').text
         # Args into __init__
         init_args = []
         for ems in class_spec.css('em'):
             props = 'property' in ems.css('::attr(class)').extract()
             if not props:
                 init_args.append(ems.extract().replace('<em>', '').\
                     replace('</em>', ''))
         # Class description. Everything up to and including the field-list.
         soup = BeautifulSoup(class_doc.extract(), 'html.parser')
         contents = soup.contents[0].contents
         description = ''
         for child in contents:
             if child.name == 'p':
                 description += child.text + '\n\n'
             if child.name == 'table':
                 raw = child.text
                 rows = [r.strip() for r in raw.split('\n') if r.strip()]
                 description += '\n'
                 description += '\n'.join(rows)
                 break
             if child.name == 'dl':
                 break
         d = self.to_dict(class_name, init_args, description)
         if d:
             yield d
         # Remaining dt are methods or attributes
         for methods in classes.css('dl.method'):
             # Parse and yield methods.
             method_name = BeautifulSoup(methods.css('code.descname').\
                 extract()[0], 'html.parser').text
             if method_name.startswith('__'):
                 break
             method_name = class_name + '.' + method_name
             method_args = []
             for ems in methods.css('em'):
                 method_args.append(ems.extract().replace('<em>', '').\
                     replace('</em>', ''))
             description = BeautifulSoup(methods.css('dd')[0].extract(),
                                         'html.parser').text
             d = self.to_dict(method_name, method_args, description)
             if d:
                 yield d
         for data in classes.css('dl.attribute'):
             name = BeautifulSoup(data.css('code.descname').extract()[0],
                                  'html.parser').text
             name = class_name + '.' + name
             description = BeautifulSoup(data.css('dd')[0].extract(),
                                         'html.parser').text
             d = self.to_dict(name, None, description)
             if d:
                 yield d
         for data in classes.css('dl.data'):
             name = BeautifulSoup(data.css('code.descname').extract()[0],
                                  'html.parser').text
             name = class_name + '.' + name
             description = BeautifulSoup(data.css('dd')[0].extract(),
                                         'html.parser').text
             d = self.to_dict(name, None, description)
             if d:
                 yield d
Example #15
def getRound(soup, id, gameId, round):
    div = soup.find('div', id=id)
    if div:
        categories = div.find_all('td', class_="category_name")
        # f = open("soup.txt","w")
        # f.write(soup.prettify())
        # f.close()
        if len(categories) == 6:
            categories = list(map(getText, categories))
            clueDivs = div.find_all('td', class_='clue')
        else:
            categories = soup.find_all('td', class_="category_name")
            clueDivs = soup.find_all('td', class_='clue')
            if round == 'Single':
                categories = categories[:6]
                clueDivs = clueDivs[:6]
            elif len(categories) >= 12:
                categories = categories[6:12]
                clueDivs = clueDivs[6:12]
            else:
                categories = []
                clueDivs = []
            categories = list(map(getText, categories))

        if len(categories) > 0:
            clues = [[0 for x in range(6)] for y in range(5)]
            answers = [[0 for x in range(6)] for y in range(5)]
            extract = re.compile('correct_response&quot;&gt;(.*)&lt;/em&gt;')

            row = 0
            col = 0
            numClues = 0

            for square in clueDivs:
                text = square.find('td', class_='clue_text')
                if text:
                    clues[row][col] = text.text
                    numClues += 1
                answerDiv = square.find('div')
                if answerDiv:
                    answer = extract.search(str(answerDiv))
                    pretty = BeautifulSoup(answer.group(1), 'html.parser').text
                    if pretty.startswith('<i>'):
                        pretty = pretty[3:]
                    if pretty.endswith('</i>'):
                        pretty = pretty[:-4]
                    answers[row][col] = pretty
                col += 1
                if col == 6:
                    col = 0
                    row += 1

            for col in range(6):
                sql = "INSERT INTO Categories (GameId, RoundCode, Name) VALUES (%s, %s, %s)"
                val = (gameId, round, categories[col])
                mycursor.execute(sql, val)
                mydb.commit()
                categoryId = mycursor.lastrowid

                sql = "Insert Into Clues (Categoryid, PointVal, Clue, Answer) Values (%s, %s, %s, %s)"
                val = []
                for row in range(5):
                    val.append((categoryId, row * 200 + 200, clues[row][col],
                                answers[row][col]))
                mycursor.executemany(sql, val)
                mydb.commit()

            print('\t', round, ': ', numClues)
        else:
            print('\t', round, ': no clues')
Example #16
        file_as_string = unicode(html_file.read(), errors='ignore')
        html_file.close()

        raw_a_tag_values = []
        raw_a_tag_values.extend(re.findall(r"(\<a.*?\>)", file_as_string,  re.IGNORECASE | re.MULTILINE | re.DOTALL))

        if len(raw_a_tag_values) > 0 :
            print "Operating on " + subdir_string + '/' + cur_tree_location + '/' + cur_tree_file + ":\n"
            for cur_raw_a_tag_value in raw_a_tag_values :
                cur_raw_a_tag_value_orig = cur_raw_a_tag_value
                cur_raw_a_tag_value = cur_raw_a_tag_value.replace("\r", " ").replace("\n", " ")

                if 'href' in cur_raw_a_tag_value and not '<?' in cur_raw_a_tag_value and not ' $' in cur_raw_a_tag_value and not cur_raw_a_tag_value.count('\\') > 3 and not 'file://' in cur_raw_a_tag_value and not '<area' in cur_raw_a_tag_value :
                    cur_a_href_value = BeautifulSoup(cur_raw_a_tag_value).a['href']
                    if cur_a_href_value.lower().endswith(media_bins_suffixes):
                        if not cur_a_href_value.startswith(('http', '//')) or cur_a_href_value.startswith(on_eclipse_uri_prefixes):
                            print "Replacing " + cur_raw_a_tag_value
                            if options.auto_process is True:
                                new_filestring = media_server_url + guess_new_imagepath(
                                    cur_a_href_value,
                                    media_server_url,
                                    subdir_string + cur_tree_location
                                )
                            else:
                                new_filestring = read_input_prefill(
                                    'New img src (Enter nothing to skip) : ',
                                    media_server_url + guess_new_imagepath(
                                        cur_a_href_value,
                                        media_server_url,
                                        subdir_string + cur_tree_location
                                    )
Example #17
 def parse_api(self, response):
     """
     Parses a *potential* API documentation page.
     """
     # Find all the function definitions on the page:
     for func in response.css('dl.function'):
         # Function details are always the first items in the dl.
         func_spec = func.css('dt')[0]
         func_doc = func.css('dd')[0]
         # Function name is always first dt
         fn1 = BeautifulSoup(func_spec.css('code.descclassname').\
             extract()[0], 'html.parser').text
         fn2 = BeautifulSoup(func_spec.css('code.descname').extract()[0],
                             'html.parser').text
         func_name = fn1 + fn2
         # Args into function
         args = []
         for ems in func_spec.css('em'):
             args.append(ems.extract().replace('<em>', '').\
                 replace('</em>', ''))
         # Function description.
         soup = BeautifulSoup(func_doc.extract(), 'html.parser')
         d = self.to_dict(func_name, args, soup.text)
         if d:
             yield d
     # Find all the class definitions on the page:
     for classes in response.css('dl.class'):
         # Class details are always first items in dl.
         class_spec = classes.css('dt')[0]
         class_doc = classes.css('dd')[0]
         # Class name is always first dt
         cn1 = BeautifulSoup(class_spec.css('code.descclassname').\
             extract()[0], 'html.parser').text
         cn2 = BeautifulSoup(class_spec.css('code.descname').extract()[0],
                             'html.parser').text
         class_name = cn1 + cn2
         # Args into __init__
         init_args = []
         for ems in class_spec.css('em'):
             props = 'property' in ems.css('::attr(class)').extract()
             if not props:
                 init_args.append(ems.extract().replace('<em>', '').\
                     replace('</em>', ''))
         # Class description. Everything up to and including the field-list.
         soup = BeautifulSoup(class_doc.extract(), 'html.parser')
         contents = soup.contents[0].contents
         description = ''
         for child in contents:
             if child.name == 'p':
                 description += child.text + '\n\n'
             if child.name == 'table':
                 raw = child.text
                 rows = [r.strip() for r in raw.split('\n') if r.strip()]
                 description += '\n'
                 description += '\n'.join(rows)
                 break
             if child.name == 'dl':
                 break
         d = self.to_dict(class_name, init_args, description)
         if d:
             yield d
         # Remaining dt are methods or attributes
         for methods in classes.css('dl.method'):
             # Parse and yield methods.
             method_name = BeautifulSoup(methods.css('code.descname').\
                 extract()[0], 'html.parser').text
             if method_name.startswith('__'):
                 break
             method_name = class_name + '.' + method_name
             method_args = []
             for ems in methods.css('em'):
                 method_args.append(ems.extract().replace('<em>', '').\
                     replace('</em>', ''))
             description = BeautifulSoup(methods.css('dd')[0].extract(),
                                         'html.parser').text
             d = self.to_dict(method_name, method_args, description)
             if d:
                 yield d
         for data in classes.css('dl.attribute'):
             name = BeautifulSoup(data.css('code.descname').extract()[0],
                                  'html.parser').text
             name = class_name + '.' + name
             description = BeautifulSoup(data.css('dd')[0].extract(),
                                         'html.parser').text
             d = self.to_dict(name, None, description)
             if d:
                 yield d
         for data in classes.css('dl.data'):
             name = BeautifulSoup(data.css('code.descname').extract()[0],
                                  'html.parser').text
             name = class_name + '.' + name
             description = BeautifulSoup(data.css('dd')[0].extract(),
                                         'html.parser').text
             d = self.to_dict(name, None, description)
             if d:
                 yield d
Example #18
def descargarCategoriaEspecifica22(URLLL, resultados):
	resultado = descargarResultado("/producto/" +  URLLL , 360, 10);


	try: 
		codigo = URLLL
	except:
		codigo = ''

	try: 
		nombre = resultado.split('<h2 class="with-tabs">')[1].split('</h2>')[0].replace("\\t",'').strip()
	except:
		nombre = ''
	
	try:	
		categoria = resultado.split('<b>Categor')[1].split('</div>')[0].split('</b>')[1].replace("\\t",'').replace("\\n",'').strip()
	except:
		categoria = ''

	try:	
		costo = resultado.split('class="uc-price">')[2].split('<')[0].replace("\\t",'').strip()
	except:
		costo = ''

	try:
		fotos = 'http://www.radec.com.mx/sites/all/files/productos/' + codigo + '.jpg';
	except:
		fotos = ''

	val = 0;

	nombre2 = nombre

	try:
		for car in resultado.split("/sites/all/themes/radec/images/car_icon.gif"):


			marca = ''
			marca_auto = ''
			modelo = ''
			anio = ''
			notas = ''

			if (val == 0):
				val = 1;
			else:
				try:
					marca_auto = car.split('<')[0].split('>')[1].replace("\\t",'').split('\\n')[2].strip()
				except:
					marca_auto = ''


				try:
					marca = ''

					if (' TYC ' in nombre):
						marca = 'TYC'
					
					if ( ' DEPO ' in nombre):
						marca = 'DEPO' 
				except:
					marca = ''

				try:				
					modelo = car.split('<')[0].split('>')[1].replace("\\t",'').split('\\n')[3].strip()
				except:
					modelo = ''



				anio = car.split('<')[0].split('>')[1].replace("\\t",'').split('\\n')[5].strip()


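				# Expand a 'YYYY-YYYY' range into the individual years so the product name can list each model year.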
				if (anio != 'ALL YEARS'):
					anioOrigin2 = '#'+anio;
					anioOrigin = anioOrigin2.replace('#20','').replace('#19','').replace('-20','-').replace('-19','-')
					anioList = [];
	

					if ('-' in anio):
						
						anioInicio = int(anio.split('-')[0])
						anioFin = int(anio.split('-')[1] )

						while (anioInicio <= anioFin):
							anioList.append(str(anioInicio))
							anioInicio = anioInicio + 1;
	
						anio = ' '.join(anioList) + ' '
					
				
					if (len(anioList) < 5):
						nombre = nombre.replace(anioOrigin,anio);
					else:
						nombre = nombre.replace(anioOrigin,anioOrigin2.replace('#','').replace('-',' a '));

		

				try:	
					notas = resultado.split('<b>Aplicaciones:</b>')[1].split('</div>')[0].replace("\\t",'').replace("\\n",'').replace('<br/>',' - ')

					notas = BeautifulSoup(notas, 'html.parser').text;

					while ("  " in notas):
						notas = notas.replace('  ',' ');

					if (notas.startswith(' - ')):
						notas = notas.replace(" - ", "", 1)

					if (notas.endswith(' - ')):
						notas = rreplace(notas," - ", "", 1);

				except:
					notas = ''



				nombre= nombre.replace(' FD ', ' FORD ').replace(' CV ', ' CHEVROLET ').replace(' TY ', ' TOYOTA ').replace(' AD ', ' AUDI ').replace(' BK ', ' BUICK ').replace(' MC ', ' MERCEDES BENZ ').replace(' ST ', ' SEAT ').replace(' VW ', ' VOLKSWAGEN ').replace(' KI ', ' KIA ').replace(' NS ', ' NISSAN ').replace(' HD ', ' HONDA ').replace(' SN ',' SATURN ').replace(' JP ', ' JEEP ').replace(' AC ', ' ACURA ').replace(' DG ', ' DODGE ').replace(' PT ',' PONTIAC ').replace(' BW ', ' BMW ').replace(' CR ', ' CHRYSLER ').replace(' MT ', ' MITSUBISHI ').replace(' PG ',' PEUGEOT ').replace(' UNIV ', ' UNIVERSAL ').replace(' CR ', ' CHRYSLER ').replace(' MT ', ' MITSUBISHI ').replace(' PG ',' PEUGEOT ')
				nombre= nombre.replace(' JGO ', ' JUEGO ').replace(' CD ', ' CADILLAC ')

		resultados.append('"'+codigo+'","'+nombre +'","'+ marca +'","'+ marca_auto +'","'+ categoria +'","'+costo +'","' + modelo +'","'+ fotos+'","'+ anio +'","'+ notas +'"');
	except Exception as e:
		print('FALLO ---- > ' + URLLL)
		resultados.append('"'+URLLL+'"');
	
	return;
Example #19
sjClues = [[0 for x in range(6)] for y in range(5)]
djClues = [[0 for x in range(6)] for y in range(5)]
sjAnswers = [[0 for x in range(6)] for y in range(5)]
djAnswers = [[0 for x in range(6)] for y in range(5)]
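# Single and Double Jeopardy boards: 5 rows x 6 columns; squares left at 0 had no clue text.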
row = 0
col = 0

for square in sjDivs:
    text = square.find('td', class_='clue_text')
    if text:
        sjClues[row][col] = text.text
    answerDiv = square.find('div')
    if answerDiv:
        answer = extract.search(str(answerDiv))
        pretty = BeautifulSoup(answer.group(1), 'html.parser').text
        if pretty.startswith('<i>'):
            pretty = pretty[3:]
        if pretty.endswith('</i>'):
            pretty = pretty[:-4]
        sjAnswers[row][col] = pretty
    col += 1
    if col == 6:
        col = 0
        row += 1

row = 0
col = 0
for square in djDivs:
    text = square.find('td', class_='clue_text')
    if text:
        djClues[row][col] = text.text
Example #20
 def parse_api(self, response):
     """
     Parses a *potential* API documentation page.
     """
     # Find all the function definitions on the page:
     for func in response.css("dl.function"):
         # Function details are always the first items in the dl.
         func_spec = func.css("dt")[0]
         func_doc = func.css("dd")[0]
         # Function name is always first dt
         fn1 = BeautifulSoup(
             func_spec.css("code.descclassname").extract()[0], "html.parser"
         ).text
         fn2 = BeautifulSoup(
             func_spec.css("code.descname").extract()[0], "html.parser"
         ).text
         func_name = fn1 + fn2
         # Args into function
         args = []
         for ems in func_spec.css("em"):
             args.append(
                 ems.extract().replace("<em>", "").replace("</em>", "")
             )
         # Function description.
         soup = BeautifulSoup(func_doc.extract(), "html.parser")
         d = self.to_dict(func_name, args, soup.text)
         if d:
             yield d
     # Find all the class definitions on the page:
     for classes in response.css("dl.class"):
         # Class details are always first items in dl.
         class_spec = classes.css("dt")[0]
         class_doc = classes.css("dd")[0]
         # Class name is always first dt
         cn1 = BeautifulSoup(
             class_spec.css("code.descclassname").extract()[0],
             "html.parser",
         ).text
         cn2 = BeautifulSoup(
             class_spec.css("code.descname").extract()[0], "html.parser"
         ).text
         class_name = cn1 + cn2
         # Args into __init__
         init_args = []
         for ems in class_spec.css("em"):
             props = "property" in ems.css("::attr(class)").extract()
             if not props:
                 init_args.append(
                     ems.extract().replace("<em>", "").replace("</em>", "")
                 )
         # Class description. Everything up to and including the field-list.
         soup = BeautifulSoup(class_doc.extract(), "html.parser")
         contents = soup.contents[0].contents
         description = ""
         for child in contents:
             if child.name == "p":
                 description += child.text + "\n\n"
             if child.name == "table":
                 raw = child.text
                 rows = [r.strip() for r in raw.split("\n") if r.strip()]
                 description += "\n"
                 description += "\n".join(rows)
                 break
             if child.name == "dl":
                 break
         d = self.to_dict(class_name, init_args, description)
         if d:
             yield d
         # Remaining dt are methods or attributes
         for methods in classes.css("dl.method"):
             # Parse and yield methods.
             method_name = BeautifulSoup(
                 methods.css("code.descname").extract()[0], "html.parser"
             ).text
             if method_name.startswith("__"):
                 break
             method_name = class_name + "." + method_name
             method_args = []
             for ems in methods.css("em"):
                 method_args.append(
                     ems.extract().replace("<em>", "").replace("</em>", "")
                 )
             description = BeautifulSoup(
                 methods.css("dd")[0].extract(), "html.parser"
             ).text
             d = self.to_dict(method_name, method_args, description)
             if d:
                 yield d
         for data in classes.css("dl.attribute"):
             name = BeautifulSoup(
                 data.css("code.descname").extract()[0], "html.parser"
             ).text
             name = class_name + "." + name
             description = BeautifulSoup(
                 data.css("dd")[0].extract(), "html.parser"
             ).text
             d = self.to_dict(name, None, description)
             if d:
                 yield d
         for data in classes.css("dl.data"):
             name = BeautifulSoup(
                 data.css("code.descname").extract()[0], "html.parser"
             ).text
             name = class_name + "." + name
             description = BeautifulSoup(
                 data.css("dd")[0].extract(), "html.parser"
             ).text
             d = self.to_dict(name, None, description)
             if d:
                 yield d
Example #21
def findvideos(item):
    logger.info()

    itemlist = list()
    sub = ""
    soup = create_soup(item.url)
    matches = soup.find("div", class_="navEP2")
    if not matches:
        return itemlist

    for elem in matches.find_all("li", class_="dooplay_player_option"):

        post = {
            "action": "doo_player_ajax",
            "post": elem["data-post"],
            "nume": elem["data-nume"],
            "type": elem["data-type"]
        }
        headers = {"Referer": item.url}
        doo_url = "%swp-admin/admin-ajax.php" % host

        data = httptools.downloadpage(doo_url, post=post, headers=headers).data

        if not data:
            continue
        player_url = BeautifulSoup(data, "html5lib").find("iframe")["src"]

        player_url = player_url.replace("https://animekao.club/video/",
                                        "https://kaocentro.net/video/")
        if not player_url.startswith(
                "https://re.") and not player_url.startswith(
                    "https://kaocentro.net/video/"):
            url = process_url(player_url)
            if not url:
                continue
            itemlist.append(
                Item(channel=item.channel,
                     title='%s',
                     action='play',
                     url=url,
                     language="LAT",
                     infoLabels=item.infoLabels,
                     subtitle=sub))
        else:
            player = httptools.downloadpage(player_url,
                                            headers={
                                                "referer": item.url
                                            }).data
            soup = BeautifulSoup(player, "html5lib")
            if soup.find("div", id="ErrorWin"):
                continue
            matches = soup.find_all("li", {"onclick": True})

            lang_data = soup.find("li", class_="SLD_A")
            if lang_data.has_attr("data-lang"):
                lang = lang_data.get("data-lang", "2")
            else:
                lang = scrapertools.find_single_match(
                    lang_data.get("onclick", ""), "this, '([^']+)'")

            for elem in matches:
                if not elem.has_attr("data-r"):
                    url = scrapertools.find_single_match(
                        elem.get("onclick", ""), "go_to_player\('([^']+)")
                else:
                    url = base64.b64decode(elem["data-r"]).decode('utf-8')
                if not url or "short." in url:
                    continue
                url = process_url(url)
                if not url:
                    continue
                itemlist.append(
                    Item(channel=item.channel,
                         title='%s',
                         action='play',
                         url=url,
                         language=IDIOMAS.get(lang, "VOSE"),
                         infoLabels=item.infoLabels,
                         subtitle=sub))
    itemlist = servertools.get_servers_itemlist(
        itemlist, lambda x: x.title % x.server.capitalize())

    # Required by FilterTools

    itemlist = filtertools.get_links(itemlist, item, list_language)

    # Required by AutoPlay

    autoplay.start(itemlist, item)

    if config.get_videolibrary_support(
    ) and len(itemlist) > 0 and item.extra != 'findvideos':
        itemlist.append(
            Item(channel=item.channel,
                 title=
                 '[COLOR yellow]Añadir esta pelicula a la videoteca[/COLOR]',
                 url=item.url,
                 action="add_pelicula_to_library",
                 extra="findvideos",
                 contentTitle=item.contentTitle))

    return itemlist
Example #22
    'GET',
    'scheme':
    'https',
    'accept':
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Connection':
    'Keep-Alive',
    'cache-control':
    'max-age=0',
    'Range':
    'bytes=0-100000',
    'User-Agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}
url_htm = s.get(url, headers=headers)
soup = BeautifulSoup(url_htm.text, "html.parser")
print("hello.")

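# Walk each result block, pull the question link out of its title div, and skip unanswered-question links.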
for line in soup.findAll('div', {'class': 'e_col w4_5'}):
    soup2 = BeautifulSoup(str(line), 'html.parser')
    for line2 in soup2.findAll('div', {'class': 'title '}):
        q_url = BeautifulSoup(str(line2), 'html.parser').find('a').get('href')
        print(q_url)
        if q_url.startswith('/unanswered'):
            continue
        else:
            questionslinks.append('https://www.quora.com' + q_url)

with open('questionslinks.txt', 'w', encoding='utf-8') as f:
    for i in range(len(questionslinks)):
        f.write(questionslinks[i] + '\n')
Example #23
		for line in infile:			
			if '![' in line:
				line = re.sub(r'(?!\[.*)\]\((?!http)', '](img/', line)
			else:
				for src, target in replacements.items():
					line = line.replace(src, target)
			
			# if 'youtube.com/embed/' in line:
			# 	youtubeString = re.findall(r'(www\.youtube\.com/embed/\S*)"', line)
			# 	youtubeStringStringed = youtubeString[0]
			# 	youtubeStringFixed = 'https//' + youtubeStringStringed
			# 	youtubeStringFixed = re.sub(r'embed/', 'watch?v=', youtubeStringFixed)
			# 	wholeLine = '{% youtube %}' + youtubeStringFixed + '{% endyoutube %}'
			# 	line = '\n' + wholeLine + '\n'
			
			if line.startswith('Obra publicada con'):
				line = ''

			if '![' in line:
				line = '\n' + line + '\n'
				
			outfile.write(line)
	infile.close()
	outfile.close()
	os.remove(file)
	os.rename(tmp_file, file_name)

pasado_a_repo = 'No'
while pasado_a_repo != 'Sí':
	pasado_a_repo = input('¿Has creado el repo en github? ')