Example #1
def post(request, post_id):
    post_id = post_id[-14:]
    post = Post.objects.get(id=post_id)
    manifest = os.path.join(RAW_DIR, post.dir_name, 'manifest.json')
    with open(manifest, 'r') as mf:
        manifest = json.load(mf)
    page_type = manifest.get('page_type')  # None when the manifest has no page_type
    if page_type == 'Typora':
        with open(
                os.path.join(RAW_DIR, post.dir_name, 'templates',
                             'index.html'), 'r') as html:
            html = BeautifulSoup(html.read(), 'lxml')
        body = html.body
        body.name = 'div'  # rename the parsed <body> so it can be embedded in the site template
        html = str(body)
        return render(request, 'post/index.html', {'html': html, 'post': post})
    else:
        context = {}
        scripts = manifest['data']['script']
        if scripts:
            f_script = os.path.join(RAW_DIR, post.dir_name, 'static',
                                    scripts[0])
            f_content = os.path.join(RAW_DIR, post.dir_name, 'static',
                                     'content.xml')
            import importlib.util
            spec = importlib.util.spec_from_file_location('script', f_script)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            context = module.get_context(f_content)
        context['post'] = post
        return render(request,
                      os.path.join(post.dir_name, 'templates', 'index.html'),
                      context)
Example #2
def scanPage(br: Browser, uri_materials: str, saveto: str,
             fileext_whitelist: List[str], pythomat: Pythomat, section: str,
             overwrite: int):
    soup = br.open(uri_materials)
    soup = BeautifulSoup(soup.read(), "html.parser")

    os.chdir(saveto)

    icons = soup.select(".activityinstance .activityicon")
    for icon in icons:
        ressourceClassification = classifyRessource(icon.get("src"),
                                                    fileext_whitelist)

        if ressourceClassification is None:
            print(
                "[Ignored] Since its icon could not be classified: {}".format(
                    icon.get("src")))
        elif ressourceClassification[1] == 1:  # Download
            filelink_dom = icon.parent
            downloadpath = filelink_dom.get("href")
            downloadFromRawUrl(downloadpath, pythomat, section, br,
                               fileext_whitelist, overwrite, saveto)
        elif ressourceClassification[1] == 2:  # Folder
            filelink_dom = icon.parent
            downloadpath = filelink_dom.get("href")
            scanSubPage(br, downloadpath, saveto, fileext_whitelist, pythomat,
                        section, overwrite)
        elif ressourceClassification[1] == 3:  # Assignment
            filelink_dom = icon.parent
            downloadpath = filelink_dom.get("href")
            scanAssignmentPage(br, downloadpath, saveto, fileext_whitelist,
                               pythomat, section, overwrite)
        else:  # Don't download | ressourceClassification[1] == 0:
            print("[Ignored] Since its icon is not whitelisted: {}".format(
                icon.get("src")))
Example #3
def fourchan(s, request, channel):
    try:
        global fCounter
        board = "/" + request + "/"
        page = urllib2.Request("http://4chan.org" + board,
                               headers={'User-agent': 'Mozilla/5.0'})
        request = urllib2.urlopen(page)
        request = BeautifulSoup(request.read())
        details = request.find_all("div", {"class": "thread"})
        global fourchan_topics
        fourchan_topics = details
        name = details[0].find_all("span", {"class": "name"})
        send(s, "PRIVMSG %s :%s" % (channel, name[0].text))
        date = details[0].find_all("span", {"class": "dateTime"})
        send(s, "PRIVMSG %s :%s" % (channel, date[0].text))
        url = details[0].a['href']
        send(
            s, "PRIVMSG %s :%s" %
            (channel, "http://boards.4chan.org" + board + url))
        problem_text = details[0].find_all("blockquote",
                                           {"class": "postMessage"})
        send(s, "PRIVMSG %s :%s" % (channel, problem_text[0].text))
        fCounter = 0
        global current_board
        current_board = board
    except urllib2.HTTPError:
        send(s, "PRIVMSG %s :Throttled by tor." % channel)
Example #4
def printPage(soup, name, t=False):
    if (not t):
        soup = BeautifulSoup(soup.read(), 'lxml')
    if not os.path.exists('LogFiles/ErrorFiles/'):
        os.makedirs('LogFiles/ErrorFiles')
    misc = open("LogFiles/ErrorFiles/" + name + ".html", "w")
    print(soup.prettify(), file=misc)
    misc.close()
Example #5
def attrs_p(page_html):
    '''
        <tr class>
            <td class="count"></td>
            <td>111</td>
            <td>222</td>
        </tr>
        <tr class="odd"></tr>
        '''
    soup = BeautifulSoup(page_html, 'html.parser')
    # Match <tr> tags that carry a class attribute ("odd" or empty)
    taglist = soup.find_all('tr', attrs={'class': re.compile("(odd)|()")})
    for trtag in taglist:
        tdlist = trtag.find_all('td')  # find every <td> inside each <tr>
        print(tdlist[1].string)  # the IP value
        print(tdlist[2].string)  # the port value
Example #6
def openUrl(url):  # fetch the URL and return its parsed contents (or None on failure)
    try:
        html = urlopen(url)
    except (HTTPError, URLError) as e:
        return None
    try:
        html = BeautifulSoup(html.read(), "html.parser")
    except AttributeError as e:
        return None
    return html
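A minimal usage sketch for the openUrl helper above; the URL is only illustrative and not from the original snippet:

page = openUrl("http://www.pythonscraping.com/pages/page1.html")  # example URL
if page is None:
    print("Could not fetch or parse the page")
elif page.title is not None:
    print(page.title.get_text())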
Example #7
    def faviconUrl(self, url):
        try:
            with urlopen(url) as html:
                html = BeautifulSoup(html.read(), "html.parser")

            # Prefer rel="shortcut icon", then fall back to rel="icon"
            icon = html.find(rel="shortcut icon") or html.find(rel="icon")
            if icon is None or not icon.get("href"):
                return None
            return urljoin(url, icon["href"])
        except Exception:
            return None
Example #8
    def getPage(self, pageNum):
        try:
            url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
            request = Request(url)
            html = urlopen(request)
            bsobj = BeautifulSoup(html.read(), "html.parser")
            # print(bsobj)
            return bsobj  # BeautifulSoup has no read(); return the parsed soup itself
        except URLError as e:
            if hasattr(e, "reason"):
                print(u"Failed to reach the Baidu Tieba page, reason:", e.reason)
                return None
Example #9
def SearchLinks(url_list):
    url_beginning = 'http://' + craigslist_region +'.craigslist.org/'
    for item in url_list:
        post_url = url_beginning + item
        post_html = urllib2.urlopen(post_url)
        post_html = BeautifulSoup(post_html.read())
        meta_tags = post_html.find('meta')
        post_parent = meta_tags.parent
        post_parent = post_parent.contents
        
        post_body = post_parent[7]
        keywords = ['mario','ps3','ps4','xbox','gameboy','linux','sega','brewing'] #insert your own!
        for keyword in keywords:
            if str(post_body).find(keyword) != -1:
                send_text(post_url)
Example #10
def get_data(data, name, pattern, iscomments=False, istags=False):
    if istags:
        result = ", ".join(
            [tag.text.strip() for tag in data.find_all(**pattern)])
    elif iscomments:
        if not data.find(**pattern["counter"]):
            return ""
        else:
            postlink = data.find(**pattern["postlink"])["href"]
            with req.urlopen(postlink) as comments_data:
                comments_data = BeautifulSoup(comments_data.read(), "lxml")
                return get_data(comments_data, name, pattern["comment"])
    else:
        data = data.find(**pattern)
        if data:
            result = data.text.strip()
        else:
            return None
    return "{}: {}".format(name, result)
Example #11
def scanSubPage(br: Browser, url: str, saveto: str,
                fileext_whitelist: List[str], pythomat: Pythomat, section: str,
                overwrite: int):
    soup = br.open(url)
    soup = BeautifulSoup(soup.read(), "html.parser")

    icons = soup.select(".fp-filename-icon .icon")
    for icon in icons:
        ressourceClassification = classifyRessource(icon.get("src"),
                                                    fileext_whitelist)

        if ressourceClassification is None:
            print(
                "[Ignored] Since its icon could not be classified: {}".format(
                    icon.get("src")))
        elif ressourceClassification[1] == 1:  # Download
            filelink_dom = icon.parent.parent
            downloadpath = filelink_dom.get("href")
            downloadFromRawUrl(downloadpath, pythomat, section, br,
                               fileext_whitelist, overwrite, saveto)
        else:  # Don't download | ressourceClassification[1] == 0:
            print("[Ignored] Since its icon is not whitelisted: {}".format(
                icon.get("src")))
Example #13
                my_writer.writeheader()

faculty_1 = soup.find_all('div', {'class': "views-row"})
for i in range(0, len(faculty_1)):
    faculty = faculty_1[i]
    member = faculty.find('a', {'class': "person-view-primary-field"})
    member_name = member.get_text()
    member_name = member_name.replace('\n', '')
    member_page = "http://polisci.wustl.edu" + member['href']
    member_title = faculty.get_text()
    member_title = member_title.split("\n")
    member_title = member_title[2]
    member_title = member_title.replace(' ', '')
    #member_page = member_page.replace('\n', ' ')
    member_page_proper = urllib2.urlopen(member_page)
    member_page_proper = BeautifulSoup(member_page_proper.read())
    member_specialization_spots = member_page_proper.find_all(
        'a', {'property': "rdfs:label skos:prefLabel"})
    for spot in member_specialization_spots:
        fields = ["Political Theory", "American", "Methodology", "Comparative",
                  "International Political Economy", "Formal Theory"]
        if spot.get_text() in fields:
            member_specialization = spot.get_text()
    page_email_spot = member_page_proper.find(
        "div",
        {'class': "field field-name-field-person-email field-type-email field-label-inline clearfix"})
    page_email_spot_2 = page_email_spot.find_all("div", {"class": "field-item even"})
    for email_div in page_email_spot_2:
        if email_div.find("a", href=True):
            page_email = email_div.get_text()
    page_website_spot = member_page_proper.find(
        'div',
        {'class': "field field-name-field-person-website field-type-link-field field-label-inline clearfix"})
    if page_website_spot:
        page_website = page_website_spot.find("a", href=True)['href']
    else:
        page_website = member_page
Example #14
try:
    # html = urlopen("https://site_nao_existe.com.br") # Forcing a URLError
    # html = urlopen("http://pythonscraping.com/pages/page1.html") # Forcing success
    html = urlopen(
        "http://pythonscraping.com/pages/page1.html")  # Forcing success
except HTTPError as e:
    print(e)
except URLError as e:
    print("Server not found")
else:
    print("Server found!")
    '''
        There are situations where the site is found and responds fine, but its content
        is not what was expected. In that scenario it is essential to validate the content with BS4.
    '''
    html = BeautifulSoup(html.read(), "html.parser")
    ''' If a tag does not exist, bs4 returns None by default '''
    print(html.tag_inexistente)
    '''
        If you try to access a tag inside a tag that does not exist (i.e. None), an
        AttributeError will be raised
    '''
    try:
        print(html.tag_inexistente.qualquer_outra_tag)
    except AttributeError as e:
        print("Tag does not exist")
    ''' Recommended handling, with an example that captures the target site's H1 tag: '''
    try:
        conteudo = html.h1
    except AttributeError as e:
        print("Tag does not exist")
Example #15
prepareFile.close()
f = open('document.txt')
f.read()

# In[16]:

f = open('document.txt', 'r')
for line in f:
    print(line.strip())

# In[17]:

# Finding nltk data
path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
raw = open(path, 'r')
raw.read()

# ### Capturing User Input

# In[18]:

s = input("Enter some text")

# In[19]:

print("You typed {0} words.".format(len(word_tokenize(s))))

# ### The NLP Pipeline

# In[20]:
Example #16
def Summary():
    k = 0
    Summary = pd.DataFrame()
    ######    WTI    ####
    url1 = 'https://finance.naver.com/marketindex/worldDailyQuote.naver?marketindexCd=OIL_CL&fdtc=2&page=1'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    url2 = 'https://finance.naver.com/marketindex/worldDailyQuote.naver?marketindexCd=OIL_CL&fdtc=2&page=2'
    html2 = urlopen(url2)
    src2 = BeautifulSoup(html2.read(), "html.parser")

    url3 = 'https://finance.naver.com//marketindex/exchangeDailyQuote.naver?marketindexCd=FX_USDKRW'
    html3 = urlopen(url3)
    src3 = BeautifulSoup(html3.read(), "html.parser")
    date_USD = src3.find_all(class_="date")
    USD = src3.find_all(class_="num")

    date1 = src1.find_all(class_="date")
    date2 = src2.find_all(class_="date")
    date = date1 + date2

    WTI1 = src1.find_all(class_="num")
    WTI2 = src2.find_all(class_="num")
    WTI = WTI1 + WTI2
    # print(WTI[1])
    for k in range(len(date)):

        if date_USD[0].text.strip() == date[0].text.strip():
            # print(str(WTI[k*2+1])[str(WTI[k*2+1]).find("alt=")+5:str(WTI[k*2+1]).find("alt=")+7])
            if str(WTI[k * 2 +
                       1])[str(WTI[k * 2 + 1]).find("alt=") +
                           5:str(WTI[k * 2 + 1]).find("alt=") + 7] == "하락":
                Summary.loc[k + 1, ['일자']] = date[k].text.strip()
                Summary.loc[k + 1, ['WTI']] = WTI[
                    k * 3].text.strip() + '/-' + WTI[k * 3 + 2].text.strip()
            else:
                Summary.loc[k + 1, ['일자']] = date[k].text.strip()
                Summary.loc[k + 1, ['WTI']] = WTI[
                    k * 3].text.strip() + '/' + WTI[k * 3 + 2].text.strip()

        else:
            # print(str(WTI[k*2+1])[str(WTI[k*2+1]).find("alt=")+5:str(WTI[k*2+1]).find("alt=")+7])
            if str(WTI[k * 2 +
                       1])[str(WTI[k * 3 + 2]).find("alt=") +
                           5:str(WTI[k * 2 + 1]).find("alt=") + 7] == "하락":
                Summary.loc[0, ['일자']] = date_USD[0].text.strip()
                Summary.loc[k + 1, ['일자']] = date[k].text.strip()
                Summary.loc[k + 1, ['WTI']] = WTI[
                    k * 3].text.strip() + '/-' + WTI[k * 3 + 2].text.strip()
            else:
                Summary.loc[0, ['일자']] = date_USD[0].text.strip()
                Summary.loc[k + 1, ['일자']] = date[k].text.strip()
                Summary.loc[k + 1, ['WTI']] = WTI[
                    k * 3].text.strip() + '/' + WTI[k * 3 + 2].text.strip()

    ######    Exchange rate (USD)    ####

    for k in range(len(date)):
        for m in range(len(date_USD)):
            if date_USD[m].text.strip() == list(Summary.loc[k, ['일자']])[0]:
                # print(str(USD[m*2+1])[str(USD[m*2+1]).find("alt=")+4:str(USD[m*2+1]).find("alt=")+8])
                if str(USD[m * 2 +
                           1])[str(USD[m * 2 + 1]).find("alt=") +
                               5:str(USD[m * 2 + 1]).find("alt=") + 7] == "하락":
                    Summary.loc[k, ['환율(USD)']] = USD[m * 2].text + '/-' + USD[
                        m * 2 + 1].text.strip()
                else:
                    Summary.loc[k, ['환율(USD)']] = USD[m * 2].text + '/' + USD[
                        m * 2 + 1].text.strip()

            else:
                pass

    ######    US Treasury (10-year)    ####
    url1 = 'https://kr.investing.com/rates-bonds/u.s.-10-year-bond-yield-historical-data'
    req = Request(url1, headers={'User-Agent': 'Mozila/5.0'})
    src1 = urlopen(req)
    src2 = BeautifulSoup(src1.read(), "html.parser")
    date_US_INTER10Y_list = src2.find_all(class_="first left bold noWrap")
    US_INTER10Y = src2.find_all("tbody")[0]
    for k in range(len(date)):
        for m in range(len(date_US_INTER10Y_list)):
            date_US_INTER10Y = date_US_INTER10Y_list[m].text[
                0:4] + "." + date_US_INTER10Y_list[m].text[
                    6:8] + "." + date_US_INTER10Y_list[m].text[10:12]
            # print(date_US_INTER10Y)
            # print(US_INTER10Y.find_all("tr")[m].find_all("td")[1].text)
            if date_US_INTER10Y == list(Summary.loc[k, ['일자']])[0]:
                Summary.loc[k, ['미국채(10year)']] = US_INTER10Y.find_all("tr")[
                    m].find_all("td")[1].text + '/' + US_INTER10Y.find_all(
                        "tr")[m].find_all("td")[5].text

            else:
                pass
    ######    Dow    ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=DJI@DJI&fdtc=0'
    # req=Request(url1,headers={'User-Agent':'Mozila/5.0'})
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_DJI = src1.find_all(class_="tb_td")
    DJI = src1.find_all(class_="tb_td2")
    DJI_delta = src1.find_all(class_="tb_td3")
    DJI_UPDW = src1.find_all('tr')
    # print(DJI_UPDW[3])
    for k in range(len(date)):
        for m in range(len(date_DJI)):
            if date_DJI[m].text.strip() == list(Summary.loc[k, ['일자']])[0]:
                # print(str(DJI_UPDW[m+3])[str(DJI_UPDW[m+3]).find("=")+2:str(DJI_UPDW[m+3]).find("=")+10])
                if str(DJI_UPDW[m + 3])[str(DJI_UPDW[m + 3]).find("=") +
                                        2:str(DJI_UPDW[m + 3]).find("=") +
                                        10] == "point_dn":
                    Summary.loc[k, ['다우']] = DJI[m].text + '/-' + DJI_delta[
                        m].text.strip()
                else:
                    Summary.loc[k, ['다우']] = DJI[m].text + '/' + DJI_delta[
                        m].text.strip()

            else:
                pass

    ######    Nasdaq    ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=NAS@IXIC'
    # req=Request(url1,headers={'User-Agent':'Mozila/5.0'})
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_NAS = src1.find_all(class_="tb_td")
    NAS = src1.find_all(class_="tb_td2")
    NAS_delta = src1.find_all(class_="tb_td3")
    NAS_UPDW = src1.find_all('tr')
    # print(NAS_UPDW[3])
    for k in range(len(date)):
        for m in range(len(date_NAS)):
            if date_NAS[m].text.strip() == list(Summary.loc[k, ['일자']])[0]:
                # print(str(NAS_UPDW[m+3])[str(NAS_UPDW[m+3]).find("=")+2:str(NAS_UPDW[m+3]).find("=")+10])
                if str(NAS_UPDW[m + 3])[str(NAS_UPDW[m + 3]).find("=") +
                                        2:str(NAS_UPDW[m + 3]).find("=") +
                                        10] == "point_dn":
                    Summary.loc[k, ['나스닥']] = NAS[m].text + '/-' + NAS_delta[
                        m].text.strip()
                else:
                    Summary.loc[k, ['나스닥']] = NAS[m].text + '/' + NAS_delta[
                        m].text.strip()

            else:
                pass

    ######    SNP    ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=SPI@SPX'
    # req=Request(url1,headers={'User-Agent':'Mozila/5.0'})
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_SNP = src1.find_all(class_="tb_td")
    SNP = src1.find_all(class_="tb_td2")
    SNP_delta = src1.find_all(class_="tb_td3")
    SNP_UPDW = src1.find_all('tr')
    # print(SNP_UPDW[3])
    for k in range(len(date)):
        for m in range(len(date_SNP)):
            if date_SNP[m].text.strip() == list(Summary.loc[k, ['일자']])[0]:
                # print(str(SNP_UPDW[m+3])[str(SNP_UPDW[m+3]).find("=")+2:str(SNP_UPDW[m+3]).find("=")+10])
                if str(SNP_UPDW[m + 3])[str(SNP_UPDW[m + 3]).find("=") +
                                        2:str(SNP_UPDW[m + 3]).find("=") +
                                        10] == "point_dn":
                    Summary.loc[k, ['S&P']] = SNP[m].text + '/-' + SNP_delta[
                        m].text.strip()
                else:
                    Summary.loc[k, ['S&P']] = SNP[m].text + '/' + SNP_delta[
                        m].text.strip()

            else:
                pass

    ######    PDM    ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=NAS@SOX'
    # req=Request(url1,headers={'User-Agent':'Mozila/5.0'})
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_PDM = src1.find_all(class_="tb_td")
    PDM = src1.find_all(class_="tb_td2")
    PDM_delta = src1.find_all(class_="tb_td3")
    PDM_UPDW = src1.find_all('tr')
    # print(PDM_UPDW[3])
    for k in range(len(date)):
        for m in range(len(date_PDM)):
            if date_PDM[m].text.strip() == list(Summary.loc[k, ['일자']])[0]:
                # print(str(PDM_UPDW[m+3])[str(PDM_UPDW[m+3]).find("=")+2:str(PDM_UPDW[m+3]).find("=")+10])
                if str(PDM_UPDW[m + 3])[str(PDM_UPDW[m + 3]).find("=") +
                                        2:str(PDM_UPDW[m + 3]).find("=") +
                                        10] == "point_dn":
                    Summary.loc[k, ['PDM']] = PDM[m].text + '/-' + PDM_delta[
                        m].text.strip()
                else:
                    Summary.loc[k, ['PDM']] = PDM[m].text + '/' + PDM_delta[
                        m].text.strip()

            else:
                pass

    print(Summary)
    Summary_html = Summary.to_html(index=False, justify='center')
    s = smtplib.SMTP('smtp.gmail.com', 587)
    s.starttls()
    s.login('*****@*****.**', 'thdfcvhemyjyxfik')
    msg = MIMEText(Summary_html, 'html')
    msg['Subject'] = '주요시장지표'  # "Key market indicators"
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**",
               msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.quit()
    return Summary
Example #17
def getHTMLsoup(url):
    HTMLsoup = urllib2.urlopen(url)
    HTMLsoup = BeautifulSoup(HTMLsoup.read())
    return HTMLsoup
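Example #17 targets Python 2's urllib2; an equivalent sketch for Python 3's standard library (same idea, with an explicit parser) would be:

from urllib.request import urlopen
from bs4 import BeautifulSoup

def getHTMLsoup(url):
    # Fetch the page and return it as a parsed BeautifulSoup document.
    page = urlopen(url)
    return BeautifulSoup(page.read(), "html.parser")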
Example #18
class File(Archive):
    def __init__(self, _path, _logger=None):
        Archive.__init__(self, _path, _logger)
        self.type = Types.FILE  # a File instance, consistent with exist() below
        self.content = None

    @property
    def title(self):
        name = self.name
        title = os.path.splitext(name)[0]
        return title

    @property
    def extension(self):
        name = self.name
        extension = os.path.splitext(name)[1]
        return extension.upper()

    def exist(self):
        exist = super().exist(Types.FILE)
        return exist

    def delete(self):
        self.logger.info(f"Deleting {self.name} on " f"in {self.parent_path}.")
        os.remove(self.path)

    def read(self):
        if self.extension.upper() == '.XML':
            try:
                file_obj = open(self.path, 'r')
                self.content = BeautifulSoup(
                    file_obj.read(),
                    features=self.extension.lower().replace('.', ''))

            except Exception as e:
                raise NameError(str(e))

        elif self.extension.upper() in ['.PNG', '.JPG', '.JPEG']:
            try:
                self.content = open(self.path, 'rb')

            except Exception as e:
                raise NameError(str(e))

        else:
            msg_error = f'Extension {self.extension} is not supported yet'
            self.logger.error(msg_error)
            raise NameError(msg_error)

    def save(self):
        if self.extension.upper() in ['.XML', '.HTML']:
            file_obj = open(self.path, mode="w", encoding="utf-8")
            file_obj.write(self.content)
            file_obj.close()

        elif self.extension.upper() in ['.PNG', '.JPG', '.JPEG']:
            file_obj = open(self.path, mode="wb")
            file_obj.write(self.content.read())
            file_obj.close()

        else:
            msg_error = f'Extension {self.extension} is not supported yet'
            self.logger.error(msg_error)
            raise NameError(msg_error)
Example #20
def Summary():
    k = 0
    Summary = pd.DataFrame()
    ######    WTI    ####
    url1 = 'https://finance.naver.com/marketindex/worldDailyQuote.naver?marketindexCd=OIL_CL&fdtc=2&page=1'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    url2 = 'https://finance.naver.com/marketindex/worldDailyQuote.naver?marketindexCd=OIL_CL&fdtc=2&page=2'
    html2 = urlopen(url2)
    src2 = BeautifulSoup(html2.read(), "html.parser")

    date1 = src1.find_all(class_="date")
    date2 = src2.find_all(class_="date")
    date_WTI = date1 + date2

    WTI1 = src1.find_all(class_="num")
    WTI2 = src2.find_all(class_="num")
    WTI = WTI1 + WTI2

    date = datetime.date.today()
    for k in range(20):
        if k == 0:
            Summary.loc[k, ['일자']] = datetime.date.strftime(date, '%Y.%m.%d')

        else:
            Summary.loc[k, ['일자']] = datetime.date.strftime(
                date + pd.DateOffset(days=-k), '%Y.%m.%d')

    m = 0
    for m in range(10):
        for k in range(len(Summary['일자'])):

            if date_WTI[m].text.strip() == Summary['일자'].iloc[k]:
                Summary.loc[k, ['WTI']] = WTI[m * 3].text.strip() + '/' + WTI[
                    m * 3 + 2].text.strip()

            else:
                pass

    ######    VIX    ####
    url1 = 'https://kr.investing.com/indices/volatility-s-p-500-historical-data'
    req = Request(url1, headers={'User-Agent': 'Mozila/5.0'})
    src1 = urlopen(req)
    src2 = BeautifulSoup(src1.read(), "html.parser")
    date_vix = src2.find_all(class_="first left bold noWrap")
    vix = src2.find_all("tbody")[1]

    for k in range(len(Summary['일자'])):
        for m in range(len(date_vix)):
            print("date", date_vix, date_vix[m], m)
            date_vix_list = date_vix[m].text[0:4] + "." + date_vix[m].text[
                6:8] + "." + date_vix[m].text[10:12]

            if date_vix_list == Summary['일자'].iloc[k]:
                Summary.loc[k, ['VIX']] = vix.find_all("tr")[m].find_all(
                    "td")[1].text + '/' + vix.find_all("tr")[m].find_all(
                        "td")[6].text

            else:
                pass
    print(Summary)
Example #21
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2017-09-23 17:11:12
# @Author  : Your Name ([email protected])
# @Link    : http://example.org
# @Version : $Id$

from urllib.request import urlopen
from urllib.parse import urlencode
from urllib.error import URLError, HTTPError
from bs4 import BeautifulSoup

# GET

values = {}
values['username'] = "******"
values['password'] = "******"
data = urlencode(values)  # build the query string appended to the login URL below

url = "http://passport.csdn.net/account/login"
geturl = url + "?" + data
print(geturl)

html = urlopen(geturl)
html = BeautifulSoup(html.read(), 'lxml')
print(html)
Example #22
dataFile = open("news20081001-20081031.txt", "w", encoding="utf-8")
from urllib import request
mediaList = []
hrefSet = set()  # used to drop duplicate links
errorHref = "http://tech.sina.com.cn/mobile/n/2008-10-21/0027842330.shtml"  # extracting this article's body directly causes a MemoryError on my machine, so it is handled separately at lines 63-71
from bs4 import BeautifulSoup

for i in range(1, 32):  # the 31 dates of the month
    if i < 10:
        date = "0" + str(i)
    else:
        date = str(i)

    url = request.urlopen("http://news.sina.com.cn/hotnews/200810" + date +
                          ".shtml")
    url = BeautifulSoup(url.read().decode("gb18030"), "html.parser")
    charNum = 0  # character count
    imgList = []  # image count

    url = str(url.find_all("a")).split(",")
    hrefs = ""

    for link in url:
        if "comment" not in link:
            hrefs += link + "\n"

    hrefs = hrefs[hrefs.find(""" <a name="2"></a>"""):hrefs.rfind(
        """<a href="/guest.html" target="_blank">新闻中心意见反馈留言板""")]

    hrefList = hrefs.split("\n")
    hrefList = hrefList[2:]
Example #24
def Summary():
    k = 0
    Summary = pd.DataFrame()
    ######    WTI    ####
    url1 = 'https://finance.naver.com/marketindex/worldDailyQuote.naver?marketindexCd=OIL_CL&fdtc=2&page=1'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    url2 = 'https://finance.naver.com/marketindex/worldDailyQuote.naver?marketindexCd=OIL_CL&fdtc=2&page=2'
    html2 = urlopen(url2)
    src2 = BeautifulSoup(html2.read(), "html.parser")

    date1 = src1.find_all(class_="date")
    date2 = src2.find_all(class_="date")
    date_WTI = date1 + date2

    WTI1 = src1.find_all(class_="num")
    WTI2 = src2.find_all(class_="num")
    WTI = WTI1 + WTI2

    date = datetime.date.today()
    for k in range(20):
        if k == 0:
            Summary.loc[k, ['일자']] = datetime.date.strftime(date, '%Y.%m.%d')

        else:
            Summary.loc[k, ['일자']] = datetime.date.strftime(
                date + pd.DateOffset(days=-k), '%Y.%m.%d')

    m = 0
    for m in range(10):
        for k in range(len(Summary['일자'])):
            # print(date_WTI[m].text.strip(),len(Summary['일자'].iloc[k]))
            if date_WTI[m].text.strip() == Summary['일자'].iloc[k]:
                Summary.loc[k, ['WTI']] = WTI[m * 3].text.strip() + '/' + WTI[
                    m * 3 + 2].text.strip()
                # print(Summary.loc[k,['WTI']])
            else:
                pass

    ######    Exchange rate (USD)    ####
    url1 = 'https://finance.naver.com//marketindex/exchangeDailyQuote.naver?marketindexCd=FX_USDKRW'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_USD = src1.find_all(class_="date")
    USD = src1.find_all(class_="num")

    m = 0
    for m in range(10):
        for k in range(len(Summary['일자'])):
            if date_USD[m].text.strip() == Summary['일자'].iloc[k]:
                # print(m,k,date_USD[m].text.strip(),Summary['일자'].iloc[k],len(Summary['일자']))
                if str(USD[m * 2 +
                           1])[str(USD[m * 2 + 1]).find("alt=") +
                               5:str(USD[m * 2 + 1]).find("alt=") + 7] == "하락":
                    Summary.loc[k, ['USD']] = USD[m * 2].text.strip(
                    ) + '/-' + USD[m * 2 + 1].text.strip()

                else:
                    Summary.loc[k,
                                ['USD']] = USD[m * 2].text.strip() + '/' + USD[
                                    m * 2 + 1].text.strip()

            else:
                pass

    ######    US Treasury (10-year)    ####
    url1 = 'https://kr.investing.com/rates-bonds/u.s.-10-year-bond-yield-historical-data'
    req = Request(url1, headers={'User-Agent': 'Mozila/5.0'})
    src1 = urlopen(req)
    src2 = BeautifulSoup(src1.read(), "html.parser")
    date_US_INTER10Y_list = src2.find_all(class_="first left bold noWrap")
    US_INTER10Y = src2.find_all("tbody")[0]

    for k in range(len(Summary['일자'])):
        for m in range(len(date_US_INTER10Y_list)):
            date_US_INTER10Y = date_US_INTER10Y_list[m].text[
                0:4] + "." + date_US_INTER10Y_list[m].text[
                    6:8] + "." + date_US_INTER10Y_list[m].text[10:12]
            # print(date_US_INTER10Y,Summary['일자'].iloc[k])
            if date_US_INTER10Y == Summary['일자'].iloc[k]:
                Summary.loc[k, ['미국채(10year)']] = US_INTER10Y.find_all("tr")[
                    m].find_all("td")[1].text + '/' + US_INTER10Y.find_all(
                        "tr")[m].find_all("td")[5].text

            else:
                pass

    ######    Dow    ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=DJI@DJI'
    # req=Request(url1,headers={'User-Agent':'Mozila/5.0'})
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_DJI = src1.find_all(class_="tb_td")
    DJI = src1.find_all(class_="tb_td2")
    DJI_delta = src1.find_all(class_="tb_td3")
    DJI_UPDW = src1.find_all('tr')

    m = 0
    for k in range(len(Summary['일자'])):
        if m + 1 != 11:
            # print(date_DJI[m+1].text.strip(),Summary['일자'].iloc[k])
            if date_DJI[m + 1].text.strip() == Summary['일자'].iloc[k]:
                # print('dji',str(DJI_UPDW[m+3])[str(DJI_UPDW[m+3]).find("=")+2:str(DJI_UPDW[m+3]).find("=")+10])
                if str(DJI_UPDW[m + 3])[str(DJI_UPDW[m + 3]).find("=") +
                                        2:str(DJI_UPDW[m + 3]).find("=") +
                                        10] == "point_dn":
                    Summary.loc[k - 1, ['다우']] = DJI[
                        m + 1].text + '/-' + DJI_delta[m + 1].text.strip()
                    # print(Summary.loc[k-1,['다우']],DJI[m+1].text+'/-'+DJI_delta[m+1].text.strip())
                    m = m + 1
                else:
                    Summary.loc[k - 1, ['다우']] = DJI[
                        m + 1].text + '/' + DJI_delta[m + 1].text.strip()
                    # print(Summary.loc[k-1,['다우']],DJI[m+1].text+'/'+DJI_delta[m+1].text.strip())
                    m = m + 1

            else:
                pass
        else:
            break

    ######    Nasdaq    ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=NAS@IXIC'
    # req=Request(url1,headers={'User-Agent':'Mozila/5.0'})
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_NAS = src1.find_all(class_="tb_td")
    NAS = src1.find_all(class_="tb_td2")
    NAS_delta = src1.find_all(class_="tb_td3")
    NAS_UPDW = src1.find_all('tr')

    m = 0
    for k in range(len(Summary['일자'])):
        if m + 1 != 11:
            if date_NAS[m + 1].text.strip() == Summary['일자'].iloc[k]:
                # print('nas',str(NAS_UPDW[m+3])[str(NAS_UPDW[m+3]).find("=")+2:str(NAS_UPDW[m+3]).find("=")+10])
                if str(NAS_UPDW[m + 3])[str(NAS_UPDW[m + 3]).find("=") +
                                        2:str(NAS_UPDW[m + 3]).find("=") +
                                        10] == "point_dn":
                    Summary.loc[k - 1, ['나스닥']] = NAS[
                        m + 1].text + '/-' + NAS_delta[m + 1].text.strip()
                    m = m + 1
                else:
                    Summary.loc[k - 1, ['나스닥']] = NAS[
                        m + 1].text + '/' + NAS_delta[m + 1].text.strip()
                    m = m + 1

            else:
                pass
        else:
            break

    ######    SNP    ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=SPI@SPX'
    # req=Request(url1,headers={'User-Agent':'Mozila/5.0'})
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_SNP = src1.find_all(class_="tb_td")
    SNP = src1.find_all(class_="tb_td2")
    SNP_delta = src1.find_all(class_="tb_td3")
    SNP_UPDW = src1.find_all('tr')

    m = 0
    for k in range(len(Summary['일자'])):
        if m + 1 != 11:
            if date_SNP[m + 1].text.strip() == Summary['일자'].iloc[k]:
                # print(str(SNP_UPDW[m+3])[str(SNP_UPDW[m+3]).find("=")+2:str(SNP_UPDW[m+3]).find("=")+10])
                if str(SNP_UPDW[m + 3])[str(SNP_UPDW[m + 3]).find("=") +
                                        2:str(SNP_UPDW[m + 3]).find("=") +
                                        10] == "point_dn":
                    Summary.loc[k - 1, ['S&P']] = SNP[
                        m + 1].text + '/-' + SNP_delta[m + 1].text.strip()
                    m = m + 1
                else:
                    Summary.loc[k - 1, ['S&P']] = SNP[
                        m + 1].text + '/' + SNP_delta[m + 1].text.strip()
                    m = m + 1

            else:
                pass
        else:
            break

    ######    PDM    ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=NAS@SOX'
    # req=Request(url1,headers={'User-Agent':'Mozila/5.0'})
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_PDM = src1.find_all(class_="tb_td")
    PDM = src1.find_all(class_="tb_td2")
    PDM_delta = src1.find_all(class_="tb_td3")
    PDM_UPDW = src1.find_all('tr')

    m = 0
    for k in range(len(Summary['일자'])):
        if m + 1 != 11:
            if date_PDM[m + 1].text.strip() == Summary['일자'].iloc[k]:
                # print('pdm',str(PDM_UPDW[m+3])[str(PDM_UPDW[m+3]).find("=")+2:str(PDM_UPDW[m+3]).find("=")+10])
                if str(PDM_UPDW[m + 3])[str(PDM_UPDW[m + 3]).find("=") +
                                        2:str(PDM_UPDW[m + 3]).find("=") +
                                        10] == "point_dn":
                    Summary.loc[k - 1, ['PDM']] = PDM[
                        m + 1].text + '/-' + PDM_delta[m + 1].text.strip()
                    m = m + 1
                else:
                    Summary.loc[k - 1, ['PDM']] = PDM[
                        m + 1].text + '/' + PDM_delta[m + 1].text.strip()
                    m = m + 1

            else:
                pass
        else:
            break

    ######    VIX(S&P500)    ####
    url1 = 'https://kr.investing.com/indices/volatility-s-p-500-historical-data'
    req = Request(url1, headers={'User-Agent': 'Mozila/5.0'})
    src1 = urlopen(req)
    src2 = BeautifulSoup(src1.read(), "html.parser")
    date_vix = src2.find_all(class_="first left bold noWrap")
    vix = src2.find_all("tbody")[1]

    for k in range(len(Summary['일자'])):
        for m in range(len(date_vix)):
            # print("date",date_vix,date_vix[m],m)
            date_vix_list = date_vix[m].text[0:4] + "." + date_vix[m].text[
                6:8] + "." + date_vix[m].text[10:12]

            if date_vix_list == Summary['일자'].iloc[k]:
                Summary.loc[k, ['VIX(S&P)']] = vix.find_all("tr")[m].find_all(
                    "td")[1].text + '/' + vix.find_all("tr")[m].find_all(
                        "td")[6].text

            else:
                pass

    ######    VIX(KOSPI)    ####
    url1 = 'https://kr.investing.com/indices/kospi-volatility-historical-data'
    req = Request(url1, headers={'User-Agent': 'Mozila/5.0'})
    src1 = urlopen(req)
    src2 = BeautifulSoup(src1.read(), "html.parser")
    date_vix = src2.find_all(class_="first left bold noWrap")
    vix = src2.find_all("tbody")[0]

    for k in range(len(Summary['일자'])):
        for m in range(len(date_vix)):
            # print("date",date_vix,date_vix[m],m)
            date_vix_list = date_vix[m].text[0:4] + "." + date_vix[m].text[
                6:8] + "." + date_vix[m].text[10:12]

            if date_vix_list == Summary['일자'].iloc[k]:
                Summary.loc[k, ['VIX(KOSPI)']] = vix.find_all(
                    "tr")[m].find_all("td")[1].text + '/' + vix.find_all(
                        "tr")[m].find_all("td")[6].text

            else:
                pass

    Summary = Summary.fillna('-')
    # print(Summary)
    Summary_html = Summary.to_html(index=False, justify='center')
    s = smtplib.SMTP('smtp.gmail.com', 587)
    s.starttls()
    s.login('*****@*****.**', 'thdfcvhemyjyxfik')
    msg = MIMEText(Summary_html, 'html')
    msg['Subject'] = '주요시장지표'  # "Key market indicators"
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**",
               msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.quit()
    return Summary
Example #25
    def scraper(self):
        try:
            self.text(92, "\n\n   BOT : Starting Wikipedia Crawl...\n")
            self.text(92, "\n   BOT : Crawling...\n")
            self.log.info("BOT : Scraper Function Started")
            r = self.r_session.get(self.host, headers=self.browser_headers)
            data = r.content
            # AN LXML ERROR MAY BE RAISED UNCAUGHT; RE-CHECK IF THE OS CHANGES
            parse = BeautifulSoup(str(data), 'lxml', from_encoding="utf-8")
            div = parse.find("div", {"id": "mp-otd"})
            hot_link = div.findAll('li')
            for elem, links in enumerate(hot_link):

                if elem == 0 or elem == 1 or elem == 2 or elem == 3 or elem == 4:
                    year = re.findall(r'[0-9]{3,4}', links.text)
                    # Without Year Content
                    # content = re.sub(r'[0-9]{4}','',links.text)
                    # With Year Content
                    content = str(links.text.encode('ascii', 'ignore'))
                    # BULK LINK EMPTY ARRAY INITIALISED ONLY
                    bulk_link = []
                    inner_link = links.findAll('a')
                    inner_link.pop(0)
                    for links in inner_link:
                        temp = BeautifulSoup(str(links), 'lxml')
                        title = temp.a['title']
                        link = temp.a['href']
                        bulk_link.append({
                            'title': title,
                            'url': link,
                            'img_w': False,
                            'img_h': False,
                            'img_size': False,
                            'ext_type': False,
                            'img': False
                        })

                    # Certain Days Wikipedia Tends To Show Less Data, Fail Safe Check | Break And Continue
                    try:
                        self.scrape_set.update({
                            elem: {
                                'year': year[0],
                                'content': str(content),
                                'links': bulk_link
                            }
                        })
                    except:
                        break

                    with open('scraped_data/temp_scraped_data.json',
                              'wb+') as content:
                        content.write(json.dumps(self.scrape_set))
                        content.close()

            data = open('scraped_data/temp_scraped_data.json', 'r')
            content = json.loads(data.read())
            data.close()

            for key, val in sorted(content.iteritems()):

                for link in val['links']:

                    url = link['url']
                    title = str(link['title'].encode('ascii', 'ignore'))
                    # Fail Safe Encodes '#' => %23 As Requests Library Does Not Do That Automatically
                    encoded_url = re.sub('#', '%23', url.replace("/wiki/", ''))
                    r = self.r_session.get(self.img_url %
                                           (encoded_url, self.img_size_approx),
                                           headers=self.browser_headers)
                    data = json.loads(r.content)
                    # FAIL SAFE PAGE KEY ERROR WIKI API
                    # Break The Loop And Move Forward
                    if 'pages' not in data['query']:
                        self.text(
                            93,
                            "\n   BOT : Page Key Error For - %s\n" % (title))
                        self.log.error("BOT : Page Key Error For - %s" %
                                       (title))
                        break

                    for k, v in data['query']['pages'].iteritems():
                        if 'thumbnail' in v:
                            thumb_url = str(v['thumbnail']['source'].encode(
                                'ascii', 'ignore'))
                            img_w = v['thumbnail']['width']
                            img_h = v['thumbnail']['height']
                            img_data = self.save_img(thumb_url, title)
                            self.temp_link.append({
                                'content_key': key,
                                'url': url,
                                'title': title,
                                'img_w': img_w,
                                'img_h': img_h,
                                'img_size': img_data[1],
                                'ext_type': img_data[2],
                                'img': img_data[0]
                            })

                        else:
                            self.text(
                                93,
                                "\n   BOT : Image Not Available For - %s (Skipping)\n"
                                % (title))
                            self.log.warn(
                                "BOT : Image Not Available For - %s (Skipping)"
                                % (title))

            temp_link_list = open('scraped_data/temp_link.json', 'wb+')
            temp_link_list.write(json.dumps({'links': self.temp_link}))
            temp_link_list.close()

            # Read Saved JSON FOR FURTHER PROCESS
            save_json = open('scraped_data/temp_scraped_data.json', 'r')
            json_content_main = json.loads(save_json.read())
            save_json.close()

            for key, val in sorted(json_content_main.iteritems()):
                temp = open('scraped_data/temp_link.json', 'r')
                content_link = json.loads(temp.read())
                temp.close()
                for link in val['links']:
                    for link_img in content_link['links']:
                        if link_img['title'] == link['title']:

                            # Replace And Re-Prepare Json With Fetched Img
                            link['img'] = link_img['img']
                            link['img_w'] = link_img['img_w']
                            link['img_h'] = link_img['img_h']
                            link['ext_type'] = link_img['ext_type']
                            link['img_size'] = link_img['img_size']

            #Write JSON (Base)
            save_json_clean = open('scraped_data/temp_scraped_data.json', 'wb')
            save_json_clean.write(json.dumps(json_content_main))
            save_json_clean.close()
            os.remove('scraped_data/temp_link.json')

            # /////////////////////////////////////////////////////////////////////////////////
            # Highlighted On This Day
            # ////////////////////////////////////////////////////////////////////////////////

            temp_highlight_json = {}
            r = self.r_session.get(self.host, headers=self.browser_headers)
            data = r.content
            parse = BeautifulSoup(str(data), 'lxml', from_encoding='utf-8')
            div = parse.find("div", {"id": "mp-otd"})
            hot_link = BeautifulSoup(str(div.find('p')),
                                     'lxml',
                                     from_encoding='utf-8')

            # Highlight Day And Contents Of It Grab Here
            highlight_day = str(hot_link.getText()).split(':')[0]
            highlight_content = str(hot_link.getText())

            # ---------------------
            # MAIN SEPARATOR
            # (CHANGE HERE IF U REQUIRE ONLY ONE ANCHOR TAG FROM FIRST <P> HIGHLIGHT ELEM)
            # ----------------------
            # ->Fetch Only 1st Elem ->># str(hot_link.html.body.p.b.a.contents[0])
            # ---------------------
            # HIGHLIGHT DIV <P> ELEM FILTER LOGIC
            # ---------------------
            # GRAB ALL ANCHOR IN HIGHLIGHT DIV <P> FIRST HIGHLIGHT TAG
            # POP 0 ELEM AS IT'S YEAR AND ALSO IGNORE YEAR LINKS
            # ---------------------

            highlight_data = list(hot_link.html.body.p.findAll('a', href=True))
            highlight_data.pop(0)

            # print highlight_data

            # TEMP Highlight Holder
            temp_highlight_link = []
            # Cleaned Highlight HOLDER
            highlight_link = []

            for link in highlight_data:
                temp = BeautifulSoup(str(link), 'lxml', from_encoding='utf-8')
                title = temp.a['title']
                link = temp.a['href']
                year = re.search(r"/wiki/[0-9]{3,4}", str(link))

                # IMP ----------
                # Ignoring Year Links In Highlight Todays Div <P> FIRST TAG
                if not year:
                    temp_highlight_link.append({'title': title, 'url': link})

            for link in temp_highlight_link:
                # Link
                title = link['title']
                url = link['url']
                # Fail Safe Encodes '#' => %23 As Requests Library Does Not Do That Automatically
                encoded_url = re.sub('#', '%23', url.replace("/wiki/", ''))
                r = self.r_session.get(self.img_url %
                                       (encoded_url, self.img_size_approx),
                                       headers=self.browser_headers)
                data = json.loads(r.content)

                # print str(data)+str('\n')

                # FAIL SAFE PAGE KEY ERROR WIKI API
                # Break The Loop And Move Forward
                if 'pages' not in data['query']:
                    self.text(93,
                              "\n   BOT : Page Key Error For - %s\n" % (title))
                    self.log.error("BOT : Page Key Error For - %s" % (title))
                    break

                for k, v in data['query']['pages'].iteritems():
                    if 'thumbnail' in v:
                        thumb_url = str(v['thumbnail']['source'].encode(
                            'ascii', 'ignore'))
                        img_w = v['thumbnail']['width']
                        img_h = v['thumbnail']['height']
                        img_data = self.save_img(thumb_url, title)
                        highlight_link.append({
                            'url': url,
                            'title': title,
                            'img_w': img_w,
                            'img_h': img_h,
                            'img_size': img_data[1],
                            'ext_type': img_data[2],
                            'img': img_data[0]
                        })

                    else:
                        self.text(
                            93,
                            "\n   BOT : Image Not Available For - %s (Skipping)\n"
                            % (title))
                        self.log.warn(
                            "BOT : Image Not Available For - %s (Skipping)" %
                            (title))

            # print temp_highlight_link
            # print highlight_link

            self.text(
                92, "\n   BOT : Scraping Data And Preparing Cleaned JSON...\n")

            # ----------------------------
            # Prepare Final Cleaned JSON
            # ----------------------------
            json_open_clean = open('scraped_data/temp_scraped_data.json', 'r')
            temp_link_clean_hold = json.loads(json_open_clean.read())
            json_open_clean.close()

            save_json = open(
                '%s/%s.json' % (self.todays_dir, self.todays_dir_dt_fmt), 'wb')
            temp_l = []
            for k, link in sorted(temp_link_clean_hold.iteritems()):
                temp_l.append(link)
            clean_data = {
                'timestamp': self.current_day,
                'article_day': self.article_day,
                'todays_highlight': {
                    'highlight_day': highlight_day,
                    'highlight_content': highlight_content,
                    'links': highlight_link
                },
                'year_highlight': temp_l
            }
            save_json.write(json.dumps(clean_data))
            os.remove('scraped_data/temp_scraped_data.json')

            self.log.info(
                str(time.strftime("BOT : Scraper Function Completed")))
            self.text(92, "\n   BOT : Completed...\n\n")

            return None

        except Exception, e:
            self.log.exception(e)
            self.text(91, "\nError : %s\n\n" % (e))
            print '-' * 60
            traceback.print_exc()
            print '-' * 60
            sys.exit(1)
Example #26
import urllib2
from bs4 import BeautifulSoup
import re
import sys

durl = "http://www.imdb.com/list/export?list_id=jwTh5Uwt2JU&author_id=ur24339561"
page = urllib2.urlopen(durl)
soup = BeautifulSoup(page)

rst = open("mvdb_const.txt", "a")
for i in re.finditer(r"\"tt\d{7}\"", soup.read()):
	print >> rst, i.group(0)
rst.close()
Example #27
from pathlib import Path
from bs4 import BeautifulSoup

data_folder = Path("input-extraction/")
overstock1 = data_folder / "jewelry01.html"
overstock2 = data_folder / "jewelry02.html"
rtvslo1 = data_folder / "Audi.html"
rtvslo2 = data_folder / "Volvo.html"
ceneje1 = data_folder / "PC-Ceneje.si.html"
ceneje2 = data_folder / "Kavci-Ceneje.si.html"
RacNovice1 = data_folder / "RacNovice1.html"
RacNovice2 = data_folder / "RacNovice2.html"

overstock1 = open(overstock1, "r", encoding='utf-8', errors='ignore')
overstock2 = open(overstock2, "r", encoding='utf-8', errors='ignore')
rtvslo1 = open(rtvslo1, "r", encoding='utf-8', errors='ignore')
rtvslo2 = open(rtvslo2, "r", encoding='utf-8', errors='ignore')
ceneje1 = open(ceneje1, "r", encoding='utf-8', errors='ignore')
ceneje2 = open(ceneje2, "r", encoding='utf-8', errors='ignore')
RacNovice1 = open(RacNovice1, "r", encoding='utf-8', errors='ignore')
RacNovice2 = open(RacNovice2, "r", encoding='utf-8', errors='ignore')

overstock1 = BeautifulSoup(overstock1.read(), 'html.parser')
overstock2 = BeautifulSoup(overstock2.read(), 'html.parser')
rtvslo1 = BeautifulSoup(rtvslo1.read(), 'html.parser')
rtvslo2 = BeautifulSoup(rtvslo2.read(), 'html.parser')
ceneje1 = BeautifulSoup(ceneje1.read(), 'html.parser')
ceneje2 = BeautifulSoup(ceneje2.read(), 'html.parser')
RacNovice1 = BeautifulSoup(RacNovice1.read(), 'html.parser')
RacNovice2 = BeautifulSoup(RacNovice2.read(), 'html.parser')
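Example #27 repeats the same open-and-parse step for every file. A more compact sketch with the same inputs (file names taken from the snippet above, soups collected in a dict) could look like this:

# Compact variant: parse each input file once and key the soups by a short label.
pages = {
    "overstock1": "jewelry01.html",
    "overstock2": "jewelry02.html",
    "rtvslo1": "Audi.html",
    "rtvslo2": "Volvo.html",
    "ceneje1": "PC-Ceneje.si.html",
    "ceneje2": "Kavci-Ceneje.si.html",
    "RacNovice1": "RacNovice1.html",
    "RacNovice2": "RacNovice2.html",
}
soups = {}
for label, filename in pages.items():
    with open(data_folder / filename, "r", encoding="utf-8", errors="ignore") as fh:
        soups[label] = BeautifulSoup(fh.read(), "html.parser")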
Example #28
result = opener.open(loginurl, postdata)

cookie.save(ignore_discard=True, ignore_expires=True)

# create the CSV output file
csv_file = open('yxdatas.csv', 'wb')
csv_writer = csv.writer(csv_file, delimiter=',')

# build each college URL from the configured cid values
Collegeurl = 'http://www.233.mistong.com/clpnew/index?cengci=b&cid={cid}&mid=#anchor'

cid = eval(config.ConfigIni.get_Cid())
for i in xrange(len(cid)):
    urlcollege = Collegeurl.format(cid=cid[i])
    response = opener.open(urlcollege)
    response = BeautifulSoup(response.read())
    try:
        colleges = response.select(".anaMes")[0].string
        print colleges
    except IndexError as e:
        print e
    data = response.select(".anaBg > ul > li ")
    for zklist in data:
        zkl = zklist.select("b")[0].get_text()
        print zkl
        # for i in zkl:
        #     zk = i
        #     print zk
        per = zklist.select("strong")[0].string
        print per
        csv_writer.writerow([colleges, zkl, per, urlcollege])