def fullPPtoW(review, re_level, sw_drop, stem, join_res=True):
    """Fully pre-process one review string.

    Pipeline: strip HTML (BeautifulSoup) -> lowercase -> regex treatment
    (``reTreatment`` at ``re_level``) -> optional stop-word removal
    (``rmStopword``) -> optional Porter stemming (``pStem``).

    Parameters
    ----------
    review : str
        Review to be pre-processed (may contain HTML markup).
    re_level : int
        Level of regex treatment (0-3), forwarded to ``reTreatment``.
    sw_drop : bool
        Should drop stop words.
    stem : bool
        Should apply Porter stemming.
    join_res : bool
        Should return the result as a string (else as a list of words).

    Returns
    -------
    str or list of str
        Treated review, space-joined when ``join_res`` is true.
    """
    text = BeautifulSoup(review).get_text().lower()
    text = reTreatment(text, re_level)
    # Tokenize once, then apply the optional stages in order.  This replaces
    # the original duplicated if/else ladder (which repeated the stem/split
    # logic in both branches) with an equivalent linear flow.
    words = text.split()
    if sw_drop:
        words = rmStopword(words)
    if stem:
        words = pStem(words)
    return " ".join(words) if join_res else words
def lexical_data(html_file, encoding="utf-8"):
    """Yield dictionary entries parsed from an HTML lexicon file.

    Each ``<p`` tag is treated as the start of one entry; after the markup
    is stripped, entries with at least three space-separated fields are
    yielded as up-to-4-element lists (``str.split`` with ``maxsplit=3``).

    Parameters
    ----------
    html_file : str
        Path of the HTML file to parse.
    encoding : str
        Text encoding used to read the file.
    """
    SEP = '_ENTRY'
    # Close the file deterministically -- the original leaked the handle.
    with open(html_file, encoding=encoding) as fh:
        html = fh.read()
    # Mark every paragraph start so entry boundaries survive get_text().
    html = re.sub(r'<p', SEP + '<p', html)
    text = BeautifulSoup(html).get_text()
    text = ' '.join(text.split())
    for entry in text.split(SEP):
        if entry.count(' ') > 2:
            yield entry.split(' ', 3)
def clean_sentence(self, sentence):
    """Normalise one sentence: optional HTML stripping, lowercasing, the
    configured regex substitutions, then stop-word and one-character word
    removal.

    Returns a string, or a list of tokens when ``self.split_words`` is set.
    """
    if self.html_clean:
        # Drop HTML markup before any other normalisation.
        sentence = BeautifulSoup(sentence).get_text()
    sentence = sentence.lower()
    # Apply each configured (pattern, replacement) pair in order.
    for ch_rep in self.clean_list:
        sentence = re.sub(ch_rep[0], ch_rep[1], sentence)
    kept = [w for w in sentence.split() if w not in self.stopwords_eng]
    kept = [w for w in kept if len(w) > 1]
    sentence = ' '.join(kept).strip(" ")
    return sentence.split() if self.split_words else sentence
def clean_data(dados):
    """Parse the <td> cells of a dollar-quote HTML page and upsert the
    (date, buy, sell) row into the ``dollar`` table.

    Parameters
    ----------
    dados : str or bytes
        Raw HTML containing the quote table.
    """
    # Parse only the table cells we care about.
    only_table_tags = SoupStrainer("td")
    soup = BeautifulSoup(dados, 'html.parser', parse_only=only_table_tags)
    soup = soup.get_text('|', strip=True)
    # Normalise decimal separators ("1,23" -> "1.23").
    valor = [v.replace(",", ".") for v in soup.split('|')]
    # First cell is the quote date; store it as YYYY/MM/DD.
    date = datetime.strptime(valor[0], "%d/%m/%Y")
    valor[0] = date.strftime("%Y/%m/%d")
    # Upsert: insert when the date is absent, otherwise update the quotes.
    # BUG FIX: query parameters must be a tuple -- the original passed the
    # bare string (valor[0]), which drivers reject or misinterpret.
    if cursor.execute("SELECT * FROM dollar WHERE dollar_dia = %s", (valor[0],)) < 1:
        try:
            cursor.execute(
                "INSERT INTO dollar (dollar_dia, dollar_compra, dollar_venda) VALUES (%s, %s, %s)",
                (str(valor[0]), str(valor[1]), str(valor[2])))
        except mariadb.Error as error:
            print("Error: {}".format(error))
    else:
        try:
            cursor.execute(
                "UPDATE dollar SET dollar_compra = %s, dollar_venda=%s WHERE dollar_dia = %s",
                (str(valor[1]), str(valor[2]), str(valor[0])))
        except mariadb.Error as error:
            print("Error: {}".format(error))
    mariadb_connection.commit()
def process_strings( string ):
    # Normalise a free-text (product/search) string into a list of tokens:
    # strip HTML, split fused CamelCase words, lowercase, drop punctuation,
    # separate "NxM" dimension patterns and digit/letter runs, then delete
    # unit abbreviations.
    # 1. Remove HTML
    words = BeautifulSoup(string).get_text()
    # separate joint words ("fooBar" -> "foo Bar")
    words = re.sub('(\w+)([A-Z][a-z]+)', lambda m: " " + m.group(1) + " " + m.group(2), words )
    # 3. Convert to lower case
    words = words.lower()
    # remove unwanted characters
    ddd = re.sub('[^a-zA-Z0-9\s]', " ", words )
    # "12x34" -> "12 34"
    ddd2 = re.sub( "(\d+)x(\d+)", lambda m: m.group(1) + " " + m.group(2) , ddd )
    # "12x " -> "12 "
    ddd3 = re.sub( "(\d+)x\s", lambda m: m.group(1) + " ", ddd2 )
    # " x34" -> " 34"
    ddd4 = re.sub( "\sx(\d+)", lambda m: " " + m.group(1), ddd3 )
    # lone " x " -> " "
    ddd5 = re.sub( "\sx\s", " " , ddd4 )
    # split letter/digit boundaries: "ab12" -> "ab 12", "12ab" -> "12 ab"
    fff = re.sub( "(\D+)(\d+)", lambda m: m.group(1) + " " + m.group(2), ddd5 )
    fff2 = re.sub( "(\d+)(\D+)", lambda m: m.group(1) + " " + m.group(2), fff )
    words = re.sub( "(\d+)(\D+)(\d+)", lambda m: m.group(1) + " " + m.group(2) + " " + m.group(3), fff2)
    # Repeatedly delete unit abbreviations (repeating catches units that
    # become adjacent to whitespace only after an earlier removal).
    # NOTE(review): this pattern was written across string line-continuations
    # in the original source; the "mp\ |" and "oc\ |" alternatives therefore
    # contain escaped whitespace that is almost certainly unintended --
    # confirm against the original file before touching this regex.
    for i in range(1,10):
        words = re.sub('\s(ft|sq|in|gal|cu|h|oz|dia|yd|yds|a|p|qt|ah|amp|gpm|mp\ |quart|watt|cc|d|inc|incl|lb|lbs|lin|ln|mil|mm|no|n|oc\ |od|pc|pal|pt|s|sch|cs|case|pallet|w)\s' , lambda m: " ", words )
    # Return the tokens.  NOTE(review): despite the original trailing comment
    # ("Join the words back into one string"), the result is a *list*.
    return ( words.split() )
def clean_str(review_docs, method=2):
    """Tokenise a collection of review documents.

    method=1: strip HTML, normalise punctuation/contractions with ordered
    regex passes, lowercase and split on single spaces (scheme adapted from
    https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py).
    method=2: unicode-normalise each document with gensim and whitespace-split.

    Returns a list of token lists, one per input document.
    """
    output_docs = []
    if method == 1:
        # Ordered (pattern, replacement) passes; each isolates or
        # normalises one token class.
        passes = [
            (r"[^A-Za-z0-9(),!?\'\`]", " "),
            (r"\'s", " \'s"),
            (r"\'ve", " \'ve"),
            (r"n\'t", " n\'t"),
            (r"\'re", " \'re"),
            (r"\'d", " \'d"),
            (r"\'ll", " \'ll"),
            (r",", " , "),
            (r"!", " ! "),
            (r"\(", " \( "),
            (r"\)", " \) "),
            (r"\?", " \? "),
            (r"\s{2,}", " "),
        ]
        for doc in review_docs:
            text = BeautifulSoup(doc, "lxml").get_text()
            for pattern, repl in passes:
                text = re.sub(pattern, repl, text)
            output_docs.append(text.strip().lower().split(" "))
    elif method == 2:
        for doc in review_docs:
            output_docs.append(gensim.utils.to_unicode(doc).split())
    return output_docs
def reviewToWordList(rawReview, removeStopWords = False):
    """Convert one raw HTML review into a list of lowercase words.

    I/O
    -Input: raw html in string form; removeStopWords optionally filters
     English stop words (NLTK).
    -Output: list of words
    """
    # HTML -> plain text, keep letters only, lowercase, then tokenise.
    text = BeautifulSoup(rawReview).get_text()
    text = re.sub("[^a-zA-Z]", " ", text).lower()
    words = text.split()
    if removeStopWords:
        # Set membership is O(1), so build the stop-word set once.
        stopSet = set(stopwords.words('english'))
        words = [w for w in words if w not in stopSet]
    return words
def reviewToWords(rawReview):
    """Pre-process a single HTML review into one cleaned string.

    Strips markup, keeps letters only, lowercases, removes English stop
    words, and returns the remaining words joined by single spaces.
    """
    plain = BeautifulSoup(rawReview).get_text()
    # Everything that is not a letter becomes whitespace.
    lettersOnly = re.sub("[^a-zA-Z]", " ", plain)
    tokens = lettersOnly.lower().split()
    # A set makes the stop-word membership test O(1).
    stops = set(stopwords.words("english"))
    kept = (word for word in tokens if word not in stops)
    return " ".join(kept)
def get_data_for_vine_id(self, vine_id, timeout=30):
    """Fetch https://vine.co/v/<vine_id> and extract this vine's data dict
    from the page's inline ``window.POST_DATA`` JavaScript.

    Raises PresserRequestError / Presser404Error / PresserURLError for
    transport problems and PresserJavaScriptParseError when the embedded
    script cannot be located or evaluated.
    """
    try:
        page = requests.get("https://vine.co/v/{}".format(vine_id), timeout=timeout)
    except requests.exceptions.RequestException as e:
        error_message = "Problem with comminicating with vine page - {}".format(e)
        raise PresserRequestError(error_message)
    if page.ok:
        content = BeautifulSoup(page.content)
        all_script_tags = content.find_all("script")
        # Inline scripts only; external ones (src=...) cannot carry POST_DATA.
        potential_script_tags = [script for script in all_script_tags if not script.has_attr("src")]
        script_lines = []
        for tag in potential_script_tags:
            # NOTE(review): this loop variable shadows the outer `content`
            # soup object; harmless today because the soup is not used after
            # this point, but fragile.
            for content in tag.contents:
                for line in content.split(";\n"):
                    if line.count("window.POST_DATA"):
                        script_lines.append(line.replace("window.POST_DATA = ", ""))
        if len(script_lines) > 1:
            raise PresserJavaScriptParseError("More POST_DATA extracted than expected")
        if not script_lines:
            raise PresserJavaScriptParseError("No POST_DATA extracted for id {}".format(vine_id))
        script_line = script_lines[0].replace("POST = ", "")
        try:
            # Evaluate the JS object literal into a Python dict.
            data = execjs.eval(script_line)
            vine = data[vine_id]
            return vine
        except execjs.RuntimeError as e:
            error_message = "Problem with parsing, check parsing logic. {}".format(e)
            raise PresserJavaScriptParseError(error_message)
    elif page.status_code == 404:
        raise Presser404Error("{} could not be found".format(page.url))
    else:
        raise PresserURLError("{} could not be accessed {} - {}".format(page.url, page.status_code,page.content))
def extract_filer_info_from_fulltext(filings):
    # Parse each filing's SEC header (the <sec-header> element of the EDGAR
    # full text) into key/value pairs and merge them into the filing dict.
    # "ITEM INFORMATION" values accumulate into a list; a "Key:" line with
    # an empty value opens a sub-section whose following lines nest under it.
    # NOTE(review): `filing.items() + header.items()` is Python 2 only --
    # Python 3 dict views do not support `+`.
    for idx,filing in enumerate(filings):
        soup = BeautifulSoup(filing['fulltext'],'lxml')
        soup = soup.find('sec-header')
        soup = soup.get_text()
        header = {'ITEM INFORMATION':[]}
        sub_head = {'flag':0}  # flag=1 while inside a sub-section
        for line in soup.split('\n'):
            if ':' not in line:
                # A non key:value line terminates any open sub-section.
                sub_head = {'flag':0}
                continue
            k = line.split(':')[0].strip()
            v = line.split(':')[1:]
            # NOTE(review): joining the split parts drops any ':' that was
            # inside the value itself (e.g. times, URLs).
            v = u''.join(list(v)).strip()
            if v == '':
                # "Key:" with no value starts a nested section.
                sub_head['flag'] = 1
                sub_head['value'] = k
                header[k] = {}
            elif sub_head['flag']:
                header[sub_head['value']][k] = v
            elif k == 'ITEM INFORMATION':
                header[k].append(v)
            else:
                header[k] = v
        filings[idx] = dict(filing.items() + header.items())
    return filings
def get_file_by_url(url):
    """
    Get a file data located at a particular URL.

    Parameters
    ----------
    url : str
        The URL at which to access the data.

    Returns
    -------
    url_data : str or None
        The data retrieved at that URL from the file.
        Returns None if the attempted retrieval is unsuccessful.

    Note
    ----
    - BeautifulSoup is used in this case to avoid having to search in which
      format we have to encode or decode data before parsing it to UTF-8.
    """
    try:
        # Close the connection deterministically -- the original leaked it.
        with urlopen(url) as f:
            raw = f.read()
        text = BeautifulSoup(raw, 'lxml').get_text()
        # Convert each host line to its IDNA form.
        return '\n'.join(map(domain_to_idna, text.split('\n')))
    except Exception:
        # Best-effort helper: report the failure and return None explicitly
        # (the original fell off the end, returning None implicitly).
        print("Problem getting file: ", url)
        return None
def nortonrate(u,logs=True,returning=False,timeout=15,proxy=None):
    '''
    Query safeweb.norton.com for a security report on a link (spam
    domain, malware presence, ...).

    Arguments:
        u: the link to check
        logs: (default: True) print the process and the report
        returning: (default: False) return the report as a string
        timeout: request timeout in seconds
        proxy: optional "host:port" HTTP proxy

    usage:
    >>>import bane
    >>>url='http://www.example.com'
    >>>bane.nortonrate(url)
    '''
    # NOTE: Python 2 code (print statements, urllib.quote).
    if proxy:
        proxy={'http':'http://'+proxy}
    s=""
    try:
        if logs==True:
            print'[*]Testing link with safeweb.norton.com'
        # Percent-encode the whole URL so it survives as a query parameter.
        ur=urllib.quote(u, safe='')
        ul='https://safeweb.norton.com/report/show?url='+ur
        c=requests.get(ul, headers = {'User-Agent': random.choice(ua)},proxies=proxy,timeout=timeout).text
        # The report summary is the text between "Summary" and the next '='.
        soup = BeautifulSoup(c, "html.parser").text
        s=soup.split("Summary")[1].split('=')[0]
        s=s.split("The Norton rating")[0].split('=')[0]
        if logs==True:
            print'[+]Report:\n',s.strip()
    except:
        # NOTE(review): bare except silently swallows every failure
        # (network errors, unexpected page layout); s stays "".
        pass
    if returning==True:
        return s.strip()
def review_to_words(raw_review, remove_stopwords = False):
    # Clean one HTML review into a list of Porter-stemmed word tokens:
    # strip markup, map digit runs to the token "number", drop remaining
    # non-letters, lowercase, tokenise, optionally remove stop words.
    # BeautifulSoup pulls data out of html file
    # here it removes html tags and markups
    text = BeautifulSoup(raw_review).get_text()
    # replace numbers by word number
    text=re.sub(r'[0-9]+','number',text)
    # remove punctuations (they can be analyzed for better results)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    #make a list of words
    words_list = text.split()
    #download nltk text data sets, including stop words
    #nltk.download()
    if remove_stopwords:
        # get stopwords, searching a set is faster than searching a list
        stops = set(stopwords.words('english'))
        # remove stopwords
        words_list = [word for word in words_list if not word in stops]
    # reduce words to their stems
    # NOTE(review): indentation reconstructed from a collapsed source; the
    # stemming below may originally have been nested inside the
    # remove_stopwords branch -- confirm against the original file.
    stemmer=PorterStemmer()
    words_list=[stemmer.stem(word) for word in words_list]
    # return the list of words
    return words_list
def gsearch( pages, Query ):
    # Search YouTube for Query via mechanize, hand the result page to
    # gparse(), then follow the "page=2" pagination link through Next()
    # when more than one page was requested.
    Query = str( Query )
    Query = str( sub(' ', '+', Query ) )  # URL-encode spaces for the query
    base = 'https://www.youtube.com'
    url = str( base + '/results?search_query=' + Query )
    print( url )
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_handle_robots(False)
    br.set_handle_equiv(False)
    br.addheaders = [('User-agent', UAS)]
    data = br.open( url )
    soup = BeautifulSoup( data.read() )
    gparse( soup )
    if pages > 1:
        i = 2
        soup = str( soup )
        # Scan the raw HTML for the link whose href contains "page=2".
        for a in soup.split('href='):
            a = str( a )
            strn = str( 'page=' + str( i ) )
            if strn in a:
                b = str( a.split('"')[1] )
                # NOTE(review): this substitution is a no-op as written;
                # it probably originally decoded '&amp;' to '&' -- confirm.
                b = str( sub('&','&',b) )
                url = str( base + b )
                Next( url, i, pages )
                break
def gsearch( pages, Query ):
    # Submit Query to Google's search form via mechanize, hand the result
    # page to gparse(), optionally follow the "start=10" next-page link,
    # then return the accumulated global results numbered and tagged
    # "| google |".
    url = 'https://encrypted.google.com'
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.set_handle_equiv(False)
    br.addheaders = [('User-agent', UAS)]
    br.open( url )
    br.select_form(name='f')
    br.form['q'] = Query
    data = br.submit()
    soup = BeautifulSoup(data.read())
    gparse( soup )
    if pages > 1:
        soup = str( soup )
        # Find the next-page link fragment (contains "start=10") in the
        # raw quoted-attribute chunks of the HTML.
        for a in soup.split('"'):
            aye = str( 'start=10' )
            if aye in str(a):
                url = str( url + str(a) )
                Next( url, 1, pages )
    nrslt = []
    global grslt
    lg = len( grslt )
    # Format each accumulated result as "<rank> | google | <result>";
    # entry 0 is skipped.
    for e in range(1, lg):
        g = str( grslt[e] )
        e = str( e )
        z = str( e + ' | google | ' + g )
        nrslt.append(z)
    return nrslt
def _old_parse_article_html(self, objectId, title, industry_press=None):
    """Scrape a Marketwired press release (located via Google cache) for
    its body text, issuing company name, external links and website.

    Returns a dict with keys: article, company_name, website, links.
    """
    df = Google().search("{0} site:marketwired.com".format(title))
    html = Google().cache(df.link.tolist()[0])
    # Parse once and reuse -- the original re-parsed the same HTML three times.
    soup = BeautifulSoup(html)
    article = soup.find("div", {"class": "mw_release"})
    article = article.text if article else None
    #company_name = soup.find("span",{"itemprop":"name"})
    company_name = soup.find("strong")
    # BUG FIX: the original called .split() on the bs4 Tag itself, which
    # raises AttributeError; split the tag's text instead.
    company_name = company_name.text.split("SOURCE:")[-1] if company_name else None
    #q.enqueue(ClearSpark()._bulk_company_info, company_name)
    links, website = [], None
    # Hosts that are social/share widgets rather than the company's site.
    ignored = ("marketwire", "javascript", "linkedin", "twitter", "youtube",
               "flickr", "facebook", "google", "addthis", "sysomos")
    for a in soup.find_all("a"):
        if "href" not in a.attrs:
            continue
        href = a["href"].lower()
        if "http" not in href:
            continue
        if any(token in href for token in ignored):
            continue
        # An anchor opening in a new tab is taken to be the company website.
        if "target" in a.attrs:
            website = a["href"]
        links.append(href.strip())
    info = {"article": article, "company_name": company_name,
            "website": website, "links": links}
    return info
def bsearch( pages, Query ):
    # Submit Query to Bing via mechanize, print result headings and links
    # through plink(), optionally follow the pagination link (href chunk
    # containing the 'PORE' marker), and return the accumulated global
    # results tagged "| bing |".
    # NOTE(review): str(...).decode('utf-8') below is Python 2 only.
    baselink = 'https://bing.com'
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', UAS)]
    r = br.open(baselink)
    html = r.read()
    br.select_form(nr=0)
    br.form['q'] = Query
    br.submit()
    soup = BeautifulSoup(br.response().read())
    # Result headings first, then every anchor on the page.
    for item in (soup.select("h2")):
        plink(item)
    for item in(soup.select("a")):
        plink(item)
    if pages > 1:
        soup = str( soup )
        for b in soup.split('"'):
            b = str( b )
            if 'PORE' in b:
                url = str( baselink + str(b) )
                url = str( unquote(url ) )#.decode('utf-8') )
                Next( url, 1, pages )
    nrslt = []
    global brslt
    lg = len( brslt )
    # Format each accumulated result as "<rank> | bing | <result>";
    # entry 0 is skipped.
    for e in range(1, lg):
        g = str( brslt[e] )
        e = str( e )
        z = str( e + ' | bing | ' + g ).decode('utf-8')
        nrslt.append(z)
    return nrslt
def cleanReview(path):
    """Read every file under ``path`` and return a list of cleaned reviews:
    lowercased, HTML-stripped, non-alphanumerics replaced by spaces, and
    whitespace collapsed to single spaces."""
    raw = []
    for name in os.listdir(path):
        fullPath = os.path.join(path, name)
        with open(fullPath, 'r', encoding='utf-8') as handle:
            raw.append(handle.read().lower())
    cleaned = []
    for review in raw:
        # Strip markup, then keep only letters and digits.
        text = BeautifulSoup(review, "html.parser").get_text()
        text = re.sub('[^a-zA-Z0-9]', ' ', text)
        # Drop empty strings and rejoin with single spaces.
        tokens = filter(None, text.split())
        cleaned.append(" ".join(tokens))
    return cleaned
def ysearch( pages, Query ):
    # Submit Query to Yahoo search via mechanize, parse results with
    # gparse(), optionally follow the class="next" pagination link, and
    # return the accumulated global results tagged "| yahoo |".
    # NOTE(review): str(unquote(url)).decode() below is Python 2 only.
    url = 'https://search.yahoo.com'
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', UAS)]
    r = br.open( url )
    html = r.read()
    br.select_form(nr=0)
    br.form['p'] = Query
    br.submit()
    soup = BeautifulSoup(br.response().read())
    gparse( soup )
    if pages > 1:
        soup = str( soup )
        # Locate the anchor carrying class="next" and extract its href.
        for e in soup.split('<a'):
            e = str( e )
            if 'class="next"' in str(e):
                url = str( sub(' class="next" href=', '', e ) )
                url = str( url.split('"')[1] )
                url = str( unquote(url) ).decode()
                # url = str( sub( '&', '&', url ) )
                Next( url, 1, pages )
    nrslt = []
    global yrslt
    i = 1
    lg = len( yrslt )
    # Format each accumulated result as "<rank> | yahoo | <result>";
    # entry 0 is skipped.
    for e in range(1, lg):
        g = str( yrslt[e] )
        e = str( e )
        z = str( e + ' | yahoo | ' + g )
        nrslt.append(z)
    return nrslt
def _clean_line(line):
    # Delete the stray byte sequences that survive BeautifulSoup extraction
    # (mojibake fragments of UTF-8 punctuation).  Order is irrelevant: each
    # pass deletes one distinct sequence.  The original also "replaced"
    # "\'" with r"'", which is a no-op and has been dropped.
    for bad in ('\xc3', '\xbd', '\xc2', '\xa0', '\xe2', '\x80', '\x99',
                '\x9c', '\x9d', '\x94', '\x97', '\xa6', '\xa2', '\x96',
                '\xb7', '\x92', '\x93', '\x8f', '\x95'):
        line = line.replace(bad, '')
    return line

def clean_htm_file(original_dir, cleaned_dir, file):
    """Convert one .htm file from ``original_dir`` into a plain-text .txt
    in ``cleaned_dir``.

    Files with few, very long lines (<= 0.0008 lines per byte) are
    prettified first so get_text() yields one fragment per markup line and
    each output line is left-stripped; other files are extracted directly.
    """
    path = os.path.join(original_dir, file)
    out_path = os.path.join(cleaned_dir, file.split('.')[0] + '.txt')
    with open(path) as original_content:
        original_lines = original_content.readlines()
    original_len = len(original_lines)
    original_size = os.path.getsize(path)
    if float(original_len) / float(original_size) <= 0.0008:
        # Minified-style file: prettify, then strip tags.
        with open(path) as original_content:
            pretty = BeautifulSoup(original_content, 'lxml').prettify()
        untagged_content = BeautifulSoup(pretty, 'html.parser').get_text().encode('utf8')
        lstrip_lines = True
    else:
        # Normal file: strip tags directly.
        with open(path) as original_content:
            untagged_content = BeautifulSoup(original_content, 'html.parser').get_text().encode('utf8')
        lstrip_lines = False
    # BUG FIX: the original guarded the replace chain with
    # `if '\xc3' or '\xbd' or ... in uc:`, which is always true (every
    # non-empty string literal is truthy), so the replacements in fact ran
    # for every line; we now simply run them unconditionally.
    with open(out_path, 'w') as formatted_file:
        for uc in untagged_content.split('\n'):
            uc = _clean_line(uc)
            if uc != '' and not uc.isspace():
                formatted_file.write((uc.lstrip() if lstrip_lines else uc) + '\n')
def getartist():
    """Read the current "now playing" artist/title from the shoutcast
    stream's inline metadata.

    Returns the metadata string (text between "playing:" and "Support"),
    None when no metadata byte is available, or "Failed" on Ctrl-C.
    """
    try:
        # BUG FIX: header values must be str/bytes -- the original passed
        # the int 1, which requests rejects with InvalidHeader.
        response = requests.get(stream_url, headers={'Icy-MetaData': '1'}, stream=True)
        response.raise_for_status()
        headers, stream = response.headers, response.raw
        # NOTE(review): the metadata interval is read but never used to
        # seek to the metadata block -- confirm the stream layout.
        meta_int = headers.get('icy-metaint')
        meta_byte = stream.read(1)
        if meta_byte:
            meta_length = ord(meta_byte) * 16
            meta_data = stream.read()
            text = str(BeautifulSoup(meta_data, 'html.parser').get_text())
            # Metadata looks like "...playing:<artist - title>Support..."
            text = text.split("playing:", 1)[1]
            return text.split("Support", 1)[0].rstrip()
    except KeyboardInterrupt:
        return "Failed"
def categories_files():
    """Split every file in ./Categories into sentences (on '.') and write
    each sentence as one line to a same-named file under ./Categories_new/."""
    os.makedirs('./Categories_new/', exist_ok=True)
    for filename in os.listdir('./Categories'):
        # Read with an explicit encoding to match the utf-8 used on write
        # (the original relied on the platform default for reading, which
        # breaks on non-ASCII content under e.g. Windows).
        with open('./Categories/' + filename, encoding='utf-8') as text:
            plain = BeautifulSoup(text.read(), "lxml").get_text()
        with open('./Categories_new/{}'.format(filename), 'w', encoding='utf-8') as new:
            for sent in plain.split('.'):
                new.write(sent + '.\n')
def bawe_file():
    """Concatenate all files in ../CORPUS_TXT into new_corpus.txt, one
    sentence (split on '.') per line."""
    with open('new_corpus.txt', 'w', encoding='utf-8') as new:
        for filename in os.listdir('../CORPUS_TXT'):
            # Explicit encoding on read -- the original used the platform
            # default, which breaks on non-ASCII corpora under e.g. Windows.
            with open('../CORPUS_TXT/' + filename, encoding='utf-8') as text:
                plain = BeautifulSoup(text.read(), "lxml").get_text()
            for sent in plain.split('.'):
                new.write(sent + '.\n')
def get_latest_articles():
    """Fetch the arXiv RSS feed and return a list of Article objects with
    identifier, subject, title, authors (affiliations stripped) and
    abstract text.  Updated (resubmitted) entries are skipped."""
    response = requests.get(ARXIV_URL)
    tree = ElementTree.fromstring(response.content)
    articles = []
    for article_xml in tree.findall('{http://purl.org/rss/1.0/}item'):
        article = Article()
        # Parse title -- feed titles end in "(arXiv:<id>v<n> [<subject>] ...)";
        # split on the last '(' after dropping the trailing ')'.
        article.title, info = article_xml.find('{http://purl.org/rss/1.0/}title').text.strip()[:-1].rsplit('(',1)
        # We don't want updates
        if "UPDATED" in info:
            continue
        # Parse out identifier and categories
        article.identifier, article.subject = info.split()[0:2]
        # Clean up identifier: drop the "arXiv:" prefix and version suffix.
        article.identifier = article.identifier.replace('arXiv:', '').split('v')[0]
        # Parse authors
        authors = BeautifulSoup(article_xml.find('{http://purl.org/dc/elements/1.1/}creator').text, "html.parser").getText()
        # Clean up authors, by removing affiliations in potentially nested
        # parentheses.  Each iteration removes one outermost (...) group;
        # the loop ends when no '(' remains.
        while True:
            start = None
            end = None
            paren = 0
            for i in range(len(authors)):
                if authors[i] == '(':
                    paren += 1
                    if paren == 1:
                        start = i
                if authors[i] == ')':
                    paren -= 1
                    if paren == 0:
                        end = i
                        break
            if start is None:
                break
            else:
                authors = authors[:start].strip() + " " + authors[end+1:].strip()
        article.authors = authors.split(', ')
        # Parse main text
        article.text = BeautifulSoup(article_xml.find('{http://purl.org/rss/1.0/}description').text, "html.parser").getText().strip()
        # Remove dollar signs (LaTeX math delimiters in abstracts)
        article.text = article.text.replace('$', '')
        articles.append(article)
    return articles
def yql_real(tick, attempts):
    # Fetch realtime quote fields for `tick` from the Yahoo Finance CSV
    # endpoint and refresh the yql_real table (delete + insert).
    # Returns 0 on success, 1 on any error.
    p = []
    p.append(tick)
    p.append(attempts)
    # Web Scrapping
    try:
        req = Request(
            'http://finance.yahoo.com/d/quotes.csv?s=' + tick + '&f=b2b3c6ej3m2r2j1',
            data=None,
            headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
            }
        )
        html = urlopen(req)
        data = html.read()
        # Parsing
        soup = BeautifulSoup(data, 'html.parser')
    # NOTE(review): HTTPError subclasses URLError, so the HTTPError branch
    # below is unreachable -- swap the order if the distinction matters.
    except URLError as e:
        writelog('[CRITICAL] URL ERROR Encountered' + str(e), 'yql_real', p)
        return 1
    except HTTPError as e:
        writelog('[WARNING] HTTP ERROR Encountered ' + str(e), 'yql_real', p)
        return 1
    except http.client.IncompleteRead as e:
        writelog('[WARNING] HTTP INCOMPLETE ERROR', 'yql_real', p)
        # NOTE(review): the retry calls yql_growth(), not yql_real() --
        # likely a copy/paste slip from the sibling function; confirm.
        if (attempts < 3):
            r = yql_growth(tick, attempts + 1)
        else:
            writelog('[CRITICAL] HTTP INCOMPLETE ERROR AFTER 3 TRIES', 'yql_real', p)
            return 1
        if (r == 0):
            return 0
        else:
            writelog('[CRITICAL] HTTP INCOMPLETE READ ERROR - Unable to resolve', 'yql_real', p)
            return 1
    # Remove subscripts
    for tag in soup.find_all('sup'):
        tag.replaceWith('')
    soup = str(soup)
    ts = soup.split(',')
    # Delete Row
    # NOTE(review): queries are built by string concatenation -- tick and
    # the scraped values are interpolated unescaped into SQL.
    dquery = 'DELETE FROM yql_real WHERE tick = \'' + tick + '\''
    dbquery(dquery)
    # Insert Row
    iquery = 'INSERT INTO yql_real (tick, ask, bid, rchange, es, marketcap, dayr, pe, smc) VALUES (\'' + tick + '\','
    for ele in ts:
        iquery = iquery + '\'' + ele + '\', '
    iquery = iquery[:-2] + ')'
    dbquery(iquery)
    return 0
def preproc(review, use_stopwords=False):
    """Clean one review: strip HTML, keep letters only, lowercase, and
    optionally remove English stop words.

    Returns the cleaned review as a single lowercase string.
    """
    review_text = BeautifulSoup(review, "lxml").get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    if use_stopwords:
        # BUG FIX: stopwords lives in nltk.corpus, not on the nltk module
        # itself -- the original raised AttributeError on this path.
        stops = set(nltk.corpus.stopwords.words("english"))
        # Lowercase before filtering so capitalised stop words ("The") are
        # caught, and so both paths consistently return lowercase text
        # (the original skipped lowercasing on this branch).
        words = [w for w in review_text.lower().split() if w not in stops]
        return " ".join(words)
    return review_text.lower()
def cleanUpText(review):
    """Strip HTML from a review, pad a small set of punctuation marks with
    spaces so each tokenises as a standalone word, and return the tokens.

    The punctuation set is by no means exhaustive.
    """
    punctuation = """.,?!:;(){}[]"""
    text = BeautifulSoup(review).get_text()
    # Newlines are dropped entirely rather than turned into spaces.
    text = text.replace('\n', '')
    for mark in punctuation:
        # Surround each mark with spaces so split() isolates it.
        text = text.replace(mark, " %s " % mark)
    return text.split()
def url_title(message, user, target, text):
    """If ``text`` starts with a URL and the channel has the "url" feature
    enabled, fetch the page and announce its (whitespace-normalised,
    truncated) title."""
    match = url_regex.match(text)
    if not match:
        return
    if not chanconfig.get_config_key(target, "url"):
        return
    url = match.group(0)
    # Bound the fetch so one slow host cannot stall the bot.
    response = requests.get(url, timeout=10)
    title_tag = BeautifulSoup(response.content).title
    # BUG FIX: pages without a <title> (or with an empty one) crashed the
    # original with AttributeError; skip them instead.
    if title_tag is None or title_tag.string is None:
        return
    title = " ".join(title_tag.string.split())
    if len(title) > 200:
        title = title[:200] + "..."
    message.client.say(target, "{}: {}".format(user.nick, title))
def set(self, **kwargs):
    """Populate subtitle/content/custom_div from kwargs (running them
    through preprocess() unless kwargs["set"] is truthy), recompute the
    word count from the markup-stripped content, then delegate to the
    parent implementation and postprocess."""
    if not kwargs.get("set", False):
        kwargs = self.preprocess(**kwargs)
    self.subtitle = kwargs["subtitle"]
    self.content = kwargs["content"]
    self.custom_div = kwargs["custom_div"]
    # Word count is taken over the plain text, not the raw HTML.
    if self.content:
        plain = BeautifulSoup(self.content, "html.parser").get_text()
        self.word_count = len(plain.split())
    else:
        self.word_count = 0
    super(Text, self).set(**kwargs)
    self.postprocess(**kwargs)
date = ['201810170CGY', '201810270BOS'] #length of 14 game_list = [] for dates in date: url = f"https://www.hockey-reference.com/boxscores/{dates}.html#all_scoring" games_html = urlopen(url) #games_html = open(f"C:\\Users\\dbge\\OneDrive - Chevron\\Random\\{dates}.html") games_soup = BeautifulSoup(games_html, 'lxml') table = games_soup.find('table') rows = table.findAll('tr') str_cells = str(rows) cleantext = BeautifulSoup(str_cells, 'lxml').get_text() s = cleantext.split(',') for i in range(1, len(s)): s[i] = s[i].replace("\t", "") df = pd.DataFrame(s) df[0] = df[0].str.split('\n') tags = df[0].apply(pd.Series) tags = tags.rename(columns=lambda x: 'tag_' + str(x)) df = pd.concat([df[:], tags[:]], axis=1) df_width = len(df.columns) cleanedDF = PPandRegularGoals(df, dates) game_list.append(cleanedDF)
#VALORI DALLA PAGINA DETTAGLIO if 1: r = requests.get(base_url + link_dettaglio, timeout=5) responce = r.content soup_dettaglio = BeautifulSoup(responce) box = soup_dettaglio.find_all( "div", attrs={"class": "padding_box"}) righe = str(box[0]).split('<br/>') lista_scorporabile = [] for riga in righe: soup_contenuto = BeautifulSoup(riga).get_text().strip() provenienza = 'SCP' if 'CIG:' in soup_contenuto: cig = soup_contenuto.split(':')[1].strip() identificativo_gara = cig + '_' + provenienza if 'Descrizione' in soup_contenuto: oggetto = soup_contenuto.split( ':')[1].strip().replace('"', "'") oggetto = pulisci(oggetto) if 'Procedura' in soup_contenuto: procedura = soup_contenuto.split( ':')[1].strip().replace('"', "'") if 'Denominazione' in soup_contenuto: ente = soup_contenuto.split( ':')[1].strip().replace('"', "'") ente = pulisci(ente)
def review_to_wordlist(review, remove_stopwords=False):
    """Normalise a tweet/review into one cleaned lowercase string.

    Expands contractions (both ASCII ' and unicode ’ forms), strips HTML,
    collapses runs of a repeated character to two, replaces @mentions with
    "user", drops https links and retweet ("RT") markers, then keeps only
    letters and a little punctuation.

    Note: ``remove_stopwords`` is accepted for interface compatibility but
    is not used by this implementation.
    """
    # Ordered (old, new) substitutions.  Order matters and several pairs
    # are repeated on purpose: str.replace scans non-overlapping matches,
    # so a second pass can catch text exposed by an earlier replacement --
    # the list is therefore kept exactly as in the original, duplicates
    # included.
    replacements = [
        ("# ", "#"), ("@ ", "@"), (" _ ", "_"), (" __ ", ""),
        ("__ ", "__"), ("_ ", "_"),
        (" ’ s ", " is "), (" ’ m ", " am "), (" ’ re ", " are "),
        ("’ ll", "will"),
        ("i'm", "i am"), ("you'll", "you will"), ("don't", "do not"),
        ("can't", "can not"), ("it's", "it is"), ("she's", "she is"),
        ("let's", "let us"), ("i'll", "i will"), ("haven't", "have not"),
        ("doesn't", "does not"), ("he's", "he is"),
        ("doesn ’ t", "does not"), ("didn ’ t", "did not"),
        ("i ’ ve", "i have"), ("we'll", "we will"), ("i ’ d", "i had"),
        ("won ’ t", "would not"), ("we ’ ve", "we have"),
        ("you ’ ve", "you are"), ("ain ’ t", "are not"),
        ("y ’ all", "you and all"), ("couldn ’ t", "could not"),
        ("haven ’ t", "have not"), ("aren't", "are not"),
        ("you ’ d", "you had"), ("that's", "that is"),
        ("wasn't", "was not"), ("he'll", "he will"), ("ma ’ am", "madam"),
        ("ma'am ", "madam"), ("they ’ ve", "they have"),
        ("don ’ t", "do not"), ("can ’ t", "can not"),
        ("isn ’ t", "is not"), ("b'day", "birthday"),
        ("I've", "I have"), ("didn't", "did not"), ("u're", "you are"),
        ("What's", "what is"), ("you're", "you are"),
        ("You're", "you are"), ("I'm", "I am"), ("isn't", "is not"),
        (" ___", "___ "), ("won't", "will not"), ("can ’ t", "can not"),
        ("I ’ ll ", "I will"), ("we ’ ll", "we will"),
        ("didn ’ t", "did not"), (" u ", " you "),
        ("wasn ’ t", "was not"),
        (" ’ s ", " is "), (" ’ m ", " am "), (" ’ re ", " are "),
        ("’ ll", "will"),
        ("don ’ t", "do not"), ("can ’ t", "can not"),
        ("isn ’ t", "is not"), ("I've", "I have"),
        ("What's", "what is"), ("you're", "you are"),
        ("You're", "you are"), ("I'm", "I am"), ("won't", "will not"),
        ("can ’ t", "can not"), ("we ’ ll", "we will"),
        ("didn ’ t", "did not"), (" u ", " you "),
        ("wasn ’ t", "was not"),
        ("+", "and"),
    ]
    clean_sentence = []
    review = review.lower()
    for old, new in replacements:
        review = review.replace(old, new)
    review_text = BeautifulSoup(review, "lxml").get_text()
    # review_text = _slang_loopup(review_text)
    # Collapse any run of 3+ identical characters to 2 ("soooo" -> "soo").
    review_text = ''.join(''.join(s)[:2] for _, s in itertools.groupby(review_text))
    for token in review_text.split():
        if token.startswith("@"):
            token = 'user'  # anonymise mentions
        # (The original evaluated token.startswith("") here -- a dead no-op
        # expression statement -- which has been removed.)
        if token.startswith("https"):
            continue
        if token.startswith("RT"):
            continue
        else:
            clean_sentence.append(token)
    review_text = ' '.join(str(t) for t in clean_sentence)
    review_text = re.sub("[^a-zA-Zn?!.]", " ", review_text)
    words = review_text.lower().split()
    orig_rev = ' '.join(words).lower()
    return (orig_rev)
import collections language = "english" search = "I" f = codecs.open("index.html", 'r', 'utf-8') def frequency(arr): return collections.Counter(arr) # the html parser is specified to ensure uniformity in all systems document = BeautifulSoup(f.read(), features="html.parser").get_text() #removing duplicated spaces and puntctiontions text = " ".join(document.split()) text = text.translate(str.maketrans('', '', string.punctuation)) #unfortunate this quotes resisted text = text.replace('”', '') text = text.replace(' “', '') stop_words = set(stopwords.words(language)) word_tokens = word_tokenize(text) filtered_sentence = [w for w in word_tokens if not w in stop_words] # print (filtered_sentence) freq = frequency(word_tokens) for key, value in freq.items():
return self._stemmer.stem(word).lower() grail = nltk.corpus.webtext.words('grail.txt') text = IndexedText(porter, grail) text.concordance('lie') #lemmatizer - making sure resulting word is in the dictionary then remove affixes wnl = nltk.WordNetLemmatizer() print([wnl.lemmatize(t) for t in raw_tokens]) #simple approaches to tokenizing text raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone though), 'I won't have any pepper in my kitchen AT ALL. Soup does very well without--Maybe it's always pepper that makes people hot-tempered,'...""" #easiest approach is to split by ' ', leaves out \n and tabs print(raw.split(' ')) print(re.split(r'[ \t\n]+', raw)) #notice the space print(re.split(r'\s+', raw)) #includes any white space character print(re.split(r'\w+', raw)) #try 'xx'.split('x') print(re.findall(r'\w+', raw)) #why does this happen? print( re.split(r'\W+', raw) ) #complement of \w, all characters other than letters, digits and underscores print( re.findall(r'\w+|\S\w*', raw) ) #first match sequence of word chars, if no match try to match any non-whitespace character(complement of \s) followed by other word characters print( re.findall(r'\w+([-\']\w+)*', raw) ) #permit word internal hyphens and apostrophes, this expression means \w+ followed by zero or more instances of [-']\w+ print(re.findall(r'\w+(?:[-\']\w+)*', raw)) print(
><span style="font-size: 11px; color: #ff8400">opinions – </span ><span style="font-size: 11px; color: #ff8400">enthusiast – </span ><span style="font-size: 11px; color: #ff8400">content – </span ><span style="font-size: 10px; color: #ff8400">founder – </span ><span style="font-size: 10px; color: #ff8400">building – </span ><span style="font-size: 10px; color: #ff8400">gwent – </span ><span style="font-size: 10px; color: #ff8400">speaker – </span ><span style="font-size: 9px; color: #ff8400">student – </span ><span style="font-size: 9px; color: #ff8400">react – </span ><span style="font-size: 9px; color: #ff8400">tweets</span> <p></p> <hr /> <p></p> <b>Two word bio cloud</b> <p></p> <span style="font-size: 60px; color: #ff8400">software engineer – </span ><span style="font-size: 27px; color: #ff8400">web developer – </span ><span style="font-size: 21px; color: #ff8400">software developer – </span ><span style="font-size: 20px; color: #ff8400">frontend developer – </span ><span style="font-size: 19px; color: #ff8400">content creator – </span ><span style="font-size: 15px; color: #ff8400">web dev – </span ><span style="font-size: 15px; color: #ff8400">full stack – </span ><span style="font-size: 10px; color: #ff8400">web development – </span ><span style="font-size: 9px; color: #ff8400">html css</span> </div> """ soup = BeautifulSoup(path, features="lxml").get_text( strip=True) # .find_all("div", {"class": "slice_body_1"}) span = soup.split('–') print(soup)
def extract_url(self, url):
    """Fetch *url* and rebuild the page <title> text as an ``http://`` URL.

    Assumes the title contains a scheme-qualified URL (e.g.
    ``https://example.com``); raises IndexError when it does not.
    """
    page = requests.get(url)
    page_title = BeautifulSoup(page.content).title.text
    # Drop whatever scheme the title carried and force plain http.
    after_scheme = page_title.split("://")[1]
    return "http://%s" % after_scheme.strip()
def crawl_cveid_list():
    """Crawl cvedetails.com and collect CVE ids for 13 vulnerability categories.

    Pass 1 fetches page 1 of each single-category listing to read the maximum
    result-page count.  Pass 2 walks every page of every category, harvests
    the lines starting with 'CVE-', and writes one id-per-line dictionary
    file per category under ../data/cvedetails_dict/.
    """

    def build_link(page_num, cat_list, sha):
        # cat_list holds thirteen '0'/'1' flags; exactly one is '1' and picks
        # the vulnerability category via the op* query parameters.
        (dos, execution, overflow, memc, sqli, xss, dirtra, httprs, bypass,
         infor, gainpre, csrf, fileinc) = cat_list
        return ('https://www.cvedetails.com/vulnerability-list.php?vendor_id=0'
                '&product_id=0&version_id=0&page=' + str(page_num) +
                '&hasexp=0&opdos=' + dos + '&opec=' + execution +
                '&opov=' + overflow + '&opcsrf=' + csrf +
                '&opgpriv=' + gainpre + '&opsqli=' + sqli + '&opxss=' + xss +
                '&opdirt=' + dirtra + '&opmemc=' + memc +
                '&ophttprs=' + httprs + '&opbyp=' + bypass +
                '&opfileinc=' + fileinc + '&opginf=' + infor +
                '&cvssscoremin=0&cvssscoremax=0&year=0&month=0&cweid=0'
                '&order=1&trc=28068&sha=' + sha)

    headers = {'User-Agent': "Magic Browser"}
    link = ''  # kept current so the except handlers can report the failing URL

    # Pass 1: read the number of result pages for each category from page 1.
    max_page_idx_list = []
    for i in range(13):
        cat_list = ['0'] * 13
        cat_list[i] = '1'
        link = build_link(1, cat_list,
                          '0ea5fbc52190c28f2a1c51aca205b315bc4c6509')
        page = requests.get(link, timeout=60, headers=headers)
        print(link)
        content = BeautifulSoup(page.content).get_text()
        keyword_section = content.replace('\n', ' ')
        # The page count sits between the '(This Page)' marker and the
        # 'How does it work?' footer text.
        loc_1 = keyword_section.find('This Page)')
        loc_2 = keyword_section.find('How does it work?')
        max_page_idx = keyword_section[loc_1 + 10:loc_2].split(' ')[0].strip().split()[-1]
        print(max_page_idx)
        max_page_idx_list.append(max_page_idx)

    name_cat = ['dos', 'execution', 'overflow', 'memc', 'sqli', 'xss',
                'dirtra', 'httprs', 'bypass', 'infor', 'gainpre', 'csrf',
                'fileinc']
    # Per-category sha tokens required by cvedetails for the filtered listing.
    sha_value_cat = ['38745b427397c23f6ed92e0ed2d3e114da828672',
                     '0ea5fbc52190c28f2a1c51aca205b315bc4c6509',
                     '363372bbc3616054065946a39f4fa589eb5f0f04',
                     '5829c45b747ab5143004640f312c7f72e5b102db',
                     '1b24fccb15090079e49c0131be821c96dc2f001c',
                     'e3bb5586965f5a13bfaa78233a10ebc3f9606d12',
                     '69098b0b30799b9520bf468c7bc060a7f756abf9',
                     'd5623136f5150876a7dfba54b38fc96fe135993c',
                     '7c71486574161a851e392e2e9dcdfea2cde521c3',
                     '1f368a2d3fc25689cc46e4dcb206b4d6103aaab7',
                     '2f1f77e26ecf09cf8b4f251b1efc2b4bcad02050',
                     'e2c3963a5b4ac67dc5dc9fe39ff95f846162e52d',
                     '4160b1b268ed8bcd97bdd927802ef4922995d3d2']

    CVE_id_list_by_cat = []
    try:
        # Pass 2: walk every page of every category and harvest the CVE ids.
        for cat_idx in range(13):
            cat_list = ['0'] * 13
            cat_list[cat_idx] = '1'
            max_page_num = int(max_page_idx_list[cat_idx])
            print('crawling the CVE ids in the ' + str(cat_idx) + ' category...')
            CVE_id_list_this_cat = []
            cve_cnt = 0
            for page_num in range(1, max_page_num + 1):
                link = build_link(page_num, cat_list, sha_value_cat[cat_idx])
                page = requests.get(link, timeout=60, headers=headers)
                print('category ' + str(cat_idx) + ', page ' + str(page_num)
                      + ', cve count ' + str(cve_cnt), link)
                content = BeautifulSoup(page.content).get_text()
                for line in content.split('\n'):
                    if line.startswith('CVE-'):
                        CVE_id_list_this_cat.append(line.strip())
                        cve_cnt += 1
            # BUG FIX: the original appended the outer list to itself
            # (CVE_id_list_by_cat.append(CVE_id_list_by_cat)), discarding the
            # per-category results and creating a self-referential list.
            CVE_id_list_by_cat.append(CVE_id_list_this_cat)
            # BUG FIX: the output file was never closed; 'with' guarantees it.
            out_path = ('../data/cvedetails_dict/cvedetails_dict'
                        + name_cat[cat_idx])
            with open(out_path, 'w') as f_cve_id_cat_file:
                for idx, cve in enumerate(CVE_id_list_this_cat, start=1):
                    f_cve_id_cat_file.write(str(idx) + '\t' + cve + '\n')
            print(CVE_id_list_by_cat)
    except requests.exceptions.HTTPError as errh:
        print("Http Error: " + str(errh) + " Please check: " + link)
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:" + str(errc) + " Please check: " + link)
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:" + str(errt) + " Please check: " + link)
    except requests.exceptions.RequestException as err:
        print("Other errors!" + str(err) + " Please check: " + link)
import requests
from bs4 import BeautifulSoup

# Interactive ticker watcher: polls the gate.io BTC/USDT ticker forever and
# prints the percent change relative to the user's buy-in price.
usrprice = int(input("Enter buy-in price: \n"))
while True:
    try:
        lst = []
        url = 'https://data.gateio.life/api2/1/ticker/btc_usdt'
        res = requests.get(url)
        html_page = res.content
        # The endpoint returns JSON; parsing it as HTML just yields the raw
        # text, which is then split on commas below.
        soup = BeautifulSoup(html_page, 'html.parser').text
        # NOTE(review): this split result is discarded — dead statement.
        soup.split(',')
        lst.append(soup)
        parts = soup.split(',')
        # presumably parts[5] is the "last"-price field — TODO confirm against
        # the api2/1/ticker response layout.
        lask = parts[5]
        # Two fixed-width slices of the price digits; which one parses depends
        # on how many digits the price happens to have.
        try1 = float(lask[13:20])
        try2 = float(lask[13:18])
        print("Percent Change: ",float(((float(try1)) - float(usrprice)) / usrprice)*100,"%")
        print("Current Price: ",try1,"$")
        print(" ")
    except Exception as e:
        # Fallback: report using the shorter slice (try2).
        # NOTE(review): if the very first iteration fails before try1/try2
        # are assigned, this handler itself raises NameError.
        print("Percent Change: ",float(((float(try1)) - float(usrprice)) / usrprice)*100,"%")
        print("Current Price: ",try2,"$")
        print(" ")
    # NOTE(review): this condition is effectively always true ('try2 != float'
    # compares a value against the *type* float), so the loop never exits, and
    # a 'continue' as the last statement of the loop is a no-op.  There is
    # also no sleep between polls.
    if try1 or try2 != float:
        continue
def main():
    """Scrape PAGES worth of user reviews from URL with headless Chrome and
    save them to reviews.csv.

    Relies on module-level URL, PAGES, binary_path and errorCodes; writes one
    CSV row per review: author, rating, date, review text.
    """
    # webdriver options
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--incognito')
    options.add_argument('--headless')

    # initialize webdriver
    driver = webdriver.Chrome(executable_path=binary_path, options=options)
    driver.get(URL)

    # initialize csv — BUG FIX: the file was never closed; 'with' guarantees
    # it is flushed and closed even when we bail out early.
    with open('reviews.csv', 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["author", "rating", "date", "review"])

        for x in range(PAGES):
            # Wait 4 seconds for the DOM to update
            time.sleep(4)

            # Grab new DOM
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Each user review
            reviews_selector = soup.find_all('div', class_='profile-user-review')
            for review_selector in reviews_selector:
                # Grab all data
                author = review_selector.find('span', attrs={
                    'itemprop': 'author'
                }).get_text()
                rating = review_selector.find('span', attrs={
                    'itemprop': 'reviewRating'
                }).get_text()
                date = review_selector.find('span', attrs={
                    'itemprop': 'datePublished'
                }).get_text()
                review = review_selector.find('span', attrs={
                    'itemprop': 'reviewBody'
                }).get_text()

                # Clean whitespace.  BUG FIX: the original split on the
                # two-character string '/n' (slash + n), which never occurs;
                # '\n' (newline) is the intended separator.
                author = _collapse_lines(author)
                rating = _collapse_lines(rating)
                date = _collapse_lines(date)
                review = BeautifulSoup(_collapse_lines(review), 'html.parser')

                # Save all data
                writer.writerow([author, rating, date, review])

            print("✅ Page " + str(x + 1) + "'s Page reviews saved\n")

            try:
                # Find next button on reviews
                nextButton = driver.find_element_by_class_name("next")
                # Click next button
                nextButton.click()
                print("🖱 Mouse click to next review page\n")
            except Exception as error:
                print(error)
                print("🚨 Unable to find buttons to click next")
                driver.quit()  # don't leave a headless Chrome running
                sys.exit(errorCodes.unable_to_identify_button)

    driver.quit()
    print("🙌🏼 Reviews saved to reviews.csv\n")


def _collapse_lines(text):
    """Collapse a multi-line string into one line, stripping each fragment."""
    return ''.join(content.strip() for content in text.split('\n'))
''' SELECT p.documentName AS docName, SUM(frequency) AS freq, GROUP_CONCAT(indexes) AS idxs FROM Posting p WHERE p.word = ? GROUP BY p.documentName ORDER BY freq DESC; ''', (token, )) for row in cursor: namea = row[0] pageContent = open(namea, 'r', encoding="utf-8").read() pageContent = BeautifulSoup(pageContent).find('body') pageContent = pageContent.text.lower() lines = pageContent.split("\n") non_empty_lines = [line for line in lines if line.strip() != ""] pageContent = "" for line in non_empty_lines: pageContent += line + "\n" text_tokens = word_tokenize(pageContent) inde = "" position = [] repeat = [] lila = [] i = 0 j = 0 x = 0 for coun in text_tokens: for coun2 in text_tokens: if (coun == coun2):
def in_avtor(url_adress):
    """Scrape one samizdat author page (cp1251-encoded) into a field dict and
    insert it into the 'Samizdat' SQLite table.

    Uses module-level `cursor` and `SQL_Connect`; on insert failure the URL is
    recorded in 'URL_Error'.  Returns the author name (the 'ФИО' field),
    possibly '' when parsing failed.
    NOTE(review): SQL is built via str.format — fragile escaping below;
    parameterized queries would be safer.
    """
    html = urlopen(url_adress).read().decode('cp1251')
    baze_parsing = {}  # Field dict for this page
    # All field keys; they are kept in Russian because they double as the SQL
    # column names below.
    key_str = [
        'URL', 'ФИО', 'Название', 'Aдpeс', 'WWW', 'Родился', 'Живет',
        'Обновлялось', 'Объем', 'Рейтинг', 'Посетителей', 'Friend', 'Страна',
        'Город', 'Кол_во', 'Кол_Оценок', 'Friend_on', 'Friend_off'
    ]
    baze_parsing['URL'] = url_adress
    # The page <h3> holds "author name:\ntitle".
    try:
        bs = BeautifulSoup(html, "html.parser").h3.text
        baze_parsing['ФИО'] = bs.split(':\n')[0]
        baze_parsing['Название'] = bs.split(':\n')[1]
    except (IndexError, AttributeError):
        baze_parsing['ФИО'] = ''
        baze_parsing['Название'] = ''
    # The first <li> carries the remaining "key: value" lines.
    try:
        start_txt = BeautifulSoup(html, "html.parser").li.text
    except (AttributeError):
        start_txt = ('\n')
    mas = []  # Values scraped from the page
    for elem in str(start_txt).split('\n'):
        # Clean stray characters from each entry.
        # NOTE(review): `'\r' or ' ' or '\t' in elem` is always truthy, so the
        # else branch is dead; harmless, since stripping absent characters is
        # a no-op.
        if '\r' or ' ' or '\t' in elem:
            mas.append(elem.strip('\r, ,\t'))
        else:
            mas.append(elem)
    # TODO: consider a different lookup strategy (via keys).
    for key in key_str[3:]:
        if key in " ".join(mas):
            for el in mas:
                if key in el:
                    try:
                        baze_parsing[key] = el.split(': ')[1]
                    except IndexError:
                        baze_parsing[key] = ''
                    break
        else:
            baze_parsing[key] = ''
    # ------------ Remove characters that would break the SQL insert ---------
    # NOTE(review): `"'" or "?" in str(baze_parsing)` is always truthy; the
    # inner ifs do the real guarding.
    if "'" or "?" in str(baze_parsing):
        for key in key_str[:]:
            if "'" in baze_parsing[key]:  # replace ' marks
                baze_parsing[key] = baze_parsing[key].replace("'", "_")
            if "?" in baze_parsing[key]:  # replace ? marks
                baze_parsing[key] = baze_parsing[key].replace("?", "_")
    # ------------- Normalize the strings before loading into the DB ---------
    if baze_parsing['Живет'] != '' or ',' in baze_parsing['Живет']:
        # 'Живет' (residence) looks like "Country, City" — TODO confirm.
        baze_parsing['Страна'] = baze_parsing['Живет'].split(',')[0]
        baze_parsing['Город'] = baze_parsing['Живет'].split(',')[1]
    # Dates arrive as dd/mm/yyyy; stored reversed as yyyy-mm-dd.
    baze_parsing['Родился'] = '-'.join(
        baze_parsing['Родился'].split('/')[::-1])
    baze_parsing['Обновлялось'] = '-'.join(
        baze_parsing['Обновлялось'].split('/')[::-1])
    if '/' in baze_parsing['Объем']:
        # 'Объем' (volume) looks like "<size>k/<count>".
        baze_parsing['Кол_во'] = baze_parsing['Объем'].split('/')[1]
        baze_parsing['Объем'] = baze_parsing['Объем'].split('k/')[0]
    if '*' in baze_parsing['Рейтинг']:
        # 'Рейтинг' (rating) looks like "<rating>*<vote count>".
        baze_parsing['Кол_Оценок'] = baze_parsing['Рейтинг'].split('*')[1]
        baze_parsing['Рейтинг'] = baze_parsing['Рейтинг'].split('*')[0]
    if '/' in baze_parsing['Friend']:
        baze_parsing['Friend_on'] = baze_parsing['Friend'].split('/')[0]
        baze_parsing['Friend_off'] = baze_parsing['Friend'].split('/')[1]
    elif baze_parsing['Friend'] != '':
        baze_parsing['Friend_on'] = baze_parsing['Friend']
    try:
        cursor.execute("""INSERT INTO 'Samizdat' (
            'URL', 'ФИО', 'Название', 'Aдpeс', 'WWW', 'Родился',
            'Обновлялось', 'Объем', 'Рейтинг', 'Посетителей', 'Страна',
            'Город', 'Кол_во', 'Кол_Оценок', 'Friend_on', 'Friend_off')
            VALUES (
            '{URL:s}', '{ФИО:s}', '{Название:s}', '{Aдpeс:s}', '{WWW:s}',
            '{Родился:s}', '{Обновлялось:s}', '{Объем:s}', '{Рейтинг:s}',
            '{Посетителей:s}', '{Страна:s}', '{Город:s}',
            '{Кол_во:s}','{Кол_Оценок:s}', '{Friend_on:s}', '{Friend_off:s}'
            )""".format(**baze_parsing))
        SQL_Connect.commit()  # Commit the changes to the database
    except sqlite3.Error as e:
        print(e, '---------->', baze_parsing['ФИО'], baze_parsing['URL'])
        # Record the failing URL so it can be retried later.
        try:
            cursor.execute(
                "INSERT INTO 'URL_Error' ('URL') VALUES ('{:s}')".format(
                    baze_parsing['URL']))
            SQL_Connect.commit()  # Commit the changes to the database
        except:
            print(e, '--------------------->', baze_parsing['URL'],
                  '<------------------')
    return baze_parsing['ФИО']
# Excerpt of a CSV-processing loop: for each row, merge the cleaned title and
# body words into `content` and gather up to three sentiment labels.
# (csv_writer / csv_reader are defined outside this excerpt.)
csv_writer.writeheader()
for row in csv_reader:
    title = row['title'].strip().lower()
    raw_content = row['raw_content']
    # Strip HTML markup from the body.
    clean_content = BeautifulSoup(raw_content, 'lxml').text
    content = []
    labels = []
    # Compile regex to remove non-alphanum char
    # NOTE(review): compiled once per row; could be hoisted above the loop.
    nonalpha = re.compile('[^a-z\-]+')
    for word in title.split(' '):
        lower_word = word.lower()  # title is already lower-cased above
        clean_word = nonalpha.sub('', lower_word)
        if clean_word != '':
            content.append(clean_word)
    for word in clean_content.split(' '):
        lower_word = word.lower()
        clean_word = nonalpha.sub('', lower_word)
        if clean_word != '':
            content.append(clean_word)
    content_str = ' '.join(content).strip()
    if row['sentiment_1'] != '':
        label = row['sentiment_1']
        labels.append(label)
    if row['sentiment_2'] != '':
        label = row['sentiment_2']
        labels.append(label)
    if row['sentiment_3'] != '':
        label = row['sentiment_3']
        labels.append(label)
    # NOTE(review): content_str and labels are not consumed within this
    # excerpt — presumably written out further down; verify against the rest
    # of the loop body.
def craw_detail(self, url, headers, cookies, it):
    """Crawl one Engineering Village search-result document.

    Fetches the result list at `url`, then the 'detailed' record (and, when
    present, up to two pages of Compendex references), builds REPLACE INTO
    statements for the ei_thesis_* MySQL tables and hands them to
    self.mysqlclient.insert_thesis_afoprt().  On any non-200 / error page the
    work item `it` is pushed back onto the Redis retry list.

    NOTE(review): SQL is assembled by string concatenation with hand-rolled
    quote escaping — parameterized queries would be safer.  The local name
    `id` shadows the builtin throughout.
    """
    # Random 0-3 s delay between requests to avoid hammering the site.
    time.sleep(random.uniform(1, 3000) / 1000)
    searchid = file_utils.url_parse(url, "SEARCHID")
    #print(url)
    session = requests.session()
    response_list = session.get(url, headers=headers, cookies=cookies)
    # Carry the server's session cookies / headers into the follow-up calls.
    cookies = requests.utils.dict_from_cookiejar(session.cookies)
    headers = response_list.headers
    #print(response_list.text)
    response_list.close()
    if response_list.status_code == 200 and 'System error happened' not in response_list.text:
        # Pre-declare every field so the SQL builders below always have a
        # value, even when a JSON key is missing.
        title = ""
        accession_number = ""
        source_title = ""
        language = ""
        document_type = ""
        abstract = ""
        number_of_references = ""
        main_heading = ""
        controlled_terms = ""
        uncontrolled_terms = ""
        classification_code = ""
        doi = ""
        database = ""  # NOTE(review): unused — the code below uses data_base
        conference_name = ""
        conference_date = ""
        conference_location = ""
        conference_code = ""
        mumerical_data_indexing = ""
        affiliation_no = ""
        author_affiliation = ""
        affiliation_organization = ""
        country = ""
        authors = ""
        affiliation_no = ""
        e_mail = ""
        funding_number = ""
        funding_acronym = ""
        funding_sponsor = ""
        source_title = ""
        abbreviated_source_title = ""
        issn = ""
        e_issn = ""
        coden = ""
        isbn_13 = ""
        article_number = ""
        issue = ""
        volume = ""
        part_number = ""
        issue_title = ""
        issue_date = ""
        publication_year = ""
        page_begin = ""
        page_end = ""
        publisher = ""
        referance_no = ""
        referance_title = ""
        referance_authors = ""
        referance_source = ""
        list_json = response_list.json()
        results = list_json["results"]
        docindex = results[0].get("doc").get("hitindex")
        docid = results[0].get("doc").get("docid")
        # abstracthref = results[0]["abstracthref"].replace("\n","").replace(" ","")
        time.sleep(random.uniform(1, 3000) / 1000)
        abstracthref = "https://www.engineeringvillage.com/search/doc/abstract.url?content=true&&pageType=quickSearch&usageZone=resultslist&usageOrigin=searchresults&searchtype=Quick&SEARCHID=" + searchid + "&DOCINDEX=" + str(
            docindex
        ) + "&ignore_docid=" + docid + "&database=1&format=quickSearchAbstractFormat&tagscope=&displayPagination=yes"
        # NOTE(review): abstracthref is built but never fetched — the abstract
        # endpoint code below is commented out.
        #session = requests.session()
        # response = session.get(self.basd_url+abstracthref,headers=headers,cookies=cookies)
        headers["Content-Type"] = "application/json"
        # headers["Connection"] = "keep-alive"
        # headers["Referer"] = "https://www.engineeringvillage.com/search/doc/abstract.url?content=true&&pageType=quickSearch&usageZone=resultslist&usageOrigin=searchresults&searchtype=Quick&SEARCHID="+searchid+"&DOCINDEX="+str(docindex)+"&ignore_docid="+docid+"&database=1&format=quickSearchAbstractFormat&tagscope=&displayPagination=yes"
        # abstract_response = session.get(abstracthref, headers=headers, cookies=cookies)
        # print(abstract_response.text)
        # abstract_json = abstract_response.json()
        # title = BeautifulSoup(abstract_json.get("abstractDetail_highlight_terms_map").get("title"),"lxml").text
        # ------------------------------------------------------detailed----------------------------------------------------------
        time.sleep(random.uniform(1, 3000) / 1000)
        detailedhref = "https://www.engineeringvillage.com/search/doc/detailed.url?content=true&SEARCHID=" + searchid + "&DOCINDEX=" + str(
            docindex
        ) + "&database=1&pageType=expertSearch&searchtype=Expert&dedupResultCount=null&format=expertSearchDetailedFormat&usageOrigin=recordpage&usageZone=abstracttab"
        session = requests.session()
        detailed_response = session.get(detailedhref, headers=headers, cookies=cookies)
        #print(detailed_response.text)
        detailed_response.close()
        if detailed_response.status_code == 200:
            detailed_json = detailed_response.json()
            #print(detailed_json)
            detailed_result = detailed_json.get("result")
            # HTML-laden fields are run through BeautifulSoup to strip markup;
            # quotes are escaped for the hand-built SQL below.
            title = BeautifulSoup(
                detailed_json.get("result").get("title"),
                "lxml").text.replace("'", "\\'").replace('"', '\\"')
            accession_number = detailed_result.get("accnum")
            author_affiliations = detailed_result.get("affils")
            source_title = detailed_result.get("ril")
            language = detailed_result.get("la")
            document_type = detailed_result.get("doctype")
            abstract = BeautifulSoup(
                detailed_json.get("abstractDetail_highlight_terms_map").
                get("abstractRecord"), "lxml").text if detailed_json.get(
                    "abstractDetail_highlight_terms_map").get(
                        "abstractRecord") is not None else ''
            number_of_references = detailed_result.get(
                "abstractrecord").get("refcount")
            main_heading = ''
            if detailed_result.get("abstractrecord") is not None:
                if detailed_result.get("abstractrecord").get(
                        "termmap") is not None:
                    if detailed_result.get("abstractrecord").get(
                            "termmap").get("MH") is not None:
                        main_heading = detailed_result.get(
                            "abstractrecord").get("termmap").get(
                                "MH")[0].get("value")
            controlled_terms = BeautifulSoup(
                detailed_json.get("abstractDetail_highlight_terms_map").
                get("CVS"), "lxml").text if detailed_json.get(
                    "abstractDetail_highlight_terms_map").get(
                        "CVS") is not None else ''
            uncontrolled_terms = BeautifulSoup(
                detailed_json.get("abstractDetail_highlight_terms_map").
                get("FLS"), "lxml").text if detailed_json.get(
                    "abstractDetail_highlight_terms_map").get(
                        "FLS") is not None else ''
            # detailed parsing of the classification codes
            classification_code_tmp = detailed_result.get(
                "abstractrecord").get("classificationcodes").get(
                    "Classification code")
            if classification_code_tmp is not None and len(
                    classification_code_tmp) > 0:
                for cc in classification_code_tmp:
                    classification_code = classification_code + cc.get(
                        "id") + cc.get("title") + " - "
                # NOTE(review): rstrip strips a *set* of chars, not the
                # literal suffix ' - '; works here but is fragile.
                classification_code = classification_code.rstrip(' - ')
            doi = detailed_result.get("doi")
            data_base = detailed_result.get("doc").get("dbname")
            conference_name = BeautifulSoup(
                detailed_result.get("cf"), "lxml"
            ).text if detailed_result.get("cf") is not None else ''
            conference_date = detailed_result.get(
                "md") if detailed_result.get("md") is not None else ''
            conference_location = detailed_result.get(
                "ml") if detailed_result.get("ml") is not None else ''
            conference_code = BeautifulSoup(
                detailed_result.get("cc"),
                "lxml").text.replace("\n", "").replace(
                    "\t",
                    "") if detailed_result.get("cc") is not None else ""
            mumerical_data_indexing = detailed_result.get(
                "ndi") if detailed_result.get("ndi") is not None else ''
            sqls = []
            # ei_thesis_thesis: one row for the document itself.
            tt_cauthors = detailed_result.get("cauthors")
            corresponding_author = ""
            corresponding_author_email = ""
            if tt_cauthors is not None and len(tt_cauthors) > 0:
                for cauthor in tt_cauthors:
                    corresponding_author = corresponding_author + cauthor.get(
                        "name") + ";"
                    # NOTE(review): `!= '' is not None` is a chained
                    # comparison; it effectively reduces to `!= ''`.
                    corresponding_author_email = corresponding_author_email + (
                        (cauthor.get("email") + ";")
                        if cauthor.get("email") != '' is not None else '')
            # Deterministic primary key derived from title + accession number.
            id = str(
                uuid.uuid3(uuid.NAMESPACE_DNS, title + accession_number))
            sql = "REPLACE INTO ei_thesis_thesis(id,title,accession_number,source_title,language,document_type,abstract,number_of_references,main_heading,controlled_terms,uncontrolled_terms,classification_code,doi,data_base,conference_name,conference_date,conference_location,conference_code,mumerical_data_indexing,corresponding_author,corresponding_author_email) " \
                  "VALUES ('" + id + "','" + title + "','" + accession_number + "','" + source_title.replace("'", "\\'").replace('"', '\\"') + "','" + language + "','" + document_type + "','" + abstract.replace("'", "\\'").replace('"', '\\"') + "','" + str(number_of_references) + "','" + main_heading + "','" + controlled_terms.replace("'", "\\'").replace('"', '\\"') + "','" + uncontrolled_terms.replace("'", "\\'").replace('"', '\\"') + "','" + classification_code + "','" + doi + "','" + data_base + "','" + conference_name.replace("'", "\\'").replace('"', '\\"') + "','" + conference_date + "','" + conference_location.replace("'", "\\'").replace('"', '\\"') + "','" + conference_code + "','" + mumerical_data_indexing + "','" + corresponding_author.replace("'", "\\'").replace('"', '\\"') + "','" + corresponding_author_email + "')"
            sqls.append(sql)
            # ei_thesis_affiliation: one row per author affiliation.
            if author_affiliations is not None and len(
                    author_affiliations) > 0:
                for af in author_affiliations:
                    author_affiliation = BeautifulSoup(
                        af.get("name"),
                        "lxml").text if af.get("name") is not None else ''
                    aocs = author_affiliation.split(",")
                    affiliation_organization = ''
                    country = ''
                    # Heuristic: pick organization / country by how many
                    # comma-separated parts the affiliation string has.
                    if len(aocs) == 5:
                        affiliation_organization = aocs[-3]
                        country = aocs[-1]
                    elif len(aocs) == 4:
                        affiliation_organization = aocs[-3]
                        country = aocs[-1]
                    elif len(aocs) == 3:
                        affiliation_organization = aocs[-2]
                        country = aocs[-1]
                    id = str(
                        uuid.uuid3(
                            uuid.NAMESPACE_DNS,
                            title + accession_number + str(af.get("id"))))
                    sql = 'REPLACE INTO ei_thesis_affiliation(id,title,accession_number,affiliation_no,author_affiliation,affiliation_organization,country) ' \
                          'VALUES ("' + id + '","' + title + '","' + accession_number + '","' + str(af.get("id")) + '","' + author_affiliation + '","' + affiliation_organization + '","' + country + '")'
                    sqls.append(sql)
            # ei_thesis_author: one row per author.
            authors = detailed_result.get("authors")
            cauthors = detailed_result.get("cauthors")
            if authors is not None and len(authors) > 0:
                for au in authors:
                    affiliation_no = au.get("id")
                    author = au.get("name")
                    e_mail = au.get("email")
                    # '1' when this author is also a corresponding author.
                    corresponding_author = '0'
                    if cauthors is not None and len(cauthors) > 0:
                        for cauthor in cauthors:
                            if author == cauthor.get("name"):
                                corresponding_author = "1"
                    id = str(
                        uuid.uuid3(uuid.NAMESPACE_DNS,
                                   title + accession_number + author))
                    sql = "REPLACE INTO ei_thesis_author(id,title,accession_number,author,affiliation_no,e_mail) " \
                          "VALUES ('"+id+"','"+title+"','"+accession_number+"','"+author.replace("'", "\\'").replace('"', '\\"')+"','"+str(affiliation_no)+"','"+e_mail+"')"
                    sqls.append(sql)
            # ei_thesis_funding: one row per funding record.
            funding_details = detailed_result.get(
                "abstractrecord").get("fundingDetails")
            if funding_details is not None and len(
                    funding_details) > 0:
                for fd in funding_details:
                    id = str(
                        uuid.uuid3(
                            uuid.NAMESPACE_DNS, title + accession_number +
                            str(fd.get("fundingId"))))
                    sql = "REPLACE INTO ei_thesis_funding(id,title,accession_number,funding_number,funding_acronym,funding_sponsor) " \
                          "VALUES ('" + id + "','" + title + "','" + accession_number + "','" + str(fd.get("fundingId")) + "','" + fd.get("fundingAcronym") + "','" + fd.get("fundingAgency").replace("'", "\\'").replace('"', '\\"') + "')"
                    sqls.append(sql)
            # ei_thesis_publication: one row for the source publication.
            abbreviated_source_title = detailed_result.get(
                "sourceabbrev")
            issn = detailed_result.get("citedby").get(
                "issn") if detailed_result.get("citedby").get(
                    "issn") is not None else ''
            e_issn = detailed_result.get("abstractrecord").get(
                "eissn") if detailed_result.get("abstractrecord").get(
                    "eissn") is not None else ''
            if e_issn is not None and e_issn != '':
                # Re-insert the dash: "12345678" -> "1234-5678".
                e_issn = e_issn[0:4] + "-" + e_issn[4:len(e_issn)]
            coden = detailed_result.get("abstractrecord").get(
                "coden") if detailed_result.get("abstractrecord").get(
                    "coden") is not None else ''
            isbn_13 = detailed_result.get(
                "isbn13") if detailed_result.get(
                    "isbn13") is not None else ''
            article_number = detailed_result.get(
                "articlenumber") if detailed_result.get(
                    "articlenumber") is not None else ''
            issue = detailed_result.get("citedby").get("firstissue")
            volume = detailed_result.get("vo")
            part_number = detailed_result.get(
                "cfpnum") if detailed_result.get(
                    "cfpnum") is not None else ''
            # presumably "::H:" is an internal highlight marker — TODO confirm
            # why it is rewritten to ":H::" here.
            issue_title = detailed_result.get("mt").replace(
                "::H:", ":H::")
            issue_date = detailed_result.get("sd")
            publication_year = detailed_result.get("yr")
            pages = detailed_result.get("pages")
            page_begin = ""
            page_end = ""
            pages_split = pages.split("-")
            if len(pages_split) == 2:
                page_begin = pages_split[0]
                page_end = pages_split[1]
            publisher = detailed_result.get("pn").replace(
                "::H:", ":H::")
            id = str(
                uuid.uuid3(uuid.NAMESPACE_DNS, title + accession_number))
            sql = "REPLACE INTO ei_thesis_publication(id,title,accession_number,source_title,abbreviated_source_title,issn,e_issn,coden,isbn_13,article_number,issue,volume,part_number,issue_title,issue_date,publication_year,page_begin,page_end,publisher) " \
                  "VALUES ('" + id + "','" + title + "','" + accession_number + "','" + source_title.replace("'", "\\'").replace('"', '\\"') + "','" + abbreviated_source_title.replace("'", "\\'").replace('"', '\\"') + "','" + str(issn) + "','" + str(e_issn) + "','" + str(coden) + "','" + str(isbn_13) + "','" + str(article_number) + "','" + str(issue) + "','" + volume + "','" + str(part_number) + "','" + issue_title.replace("'", "\\'").replace('"', '\\"') + "','" + issue_date + "','" + publication_year + "','" + page_begin + "','" + page_end + "','" + publisher.replace("'", "\\'").replace('"', '\\"') + "')"
            sqls.append(sql)
            # ------------------------------------------------------Compendex Refs------------------------------------------------------
            # refs page 1: when there is no reference info, refcount is -1;
            # otherwise it holds the actual reference count.
            if number_of_references != -1:
                time.sleep(random.uniform(1, 3000) / 1000)
                refshref = "https://www.engineeringvillage.com/search/doc/refs.url?content=true&refType=compendex&searchtype=Expert&usageOrigin=recordpage&usageZone=detailedtab&pageType=expertSearch&SEARCHID=" + searchid + "&DOCINDEX=" + str(
                    docindex
                ) + "&database=1&docid=" + docid + "&totalResultsCount=67010&displayPagination=yes&dbid=cpx"
                session = requests.session()
                refs_response = session.get(refshref,
                                            headers=headers,
                                            cookies=cookies)
                #print(refs_response.text)
                refs_response.close()
                if refs_response.status_code == 200:
                    refs_json = refs_response.json()
                    #print(refs_json)
                    referenceBean = refs_json.get("referenceBean")
                    title_authors = referenceBean.get("results")
                    sources = referenceBean.get(
                        "resultformat_abssourcelines")
                    if title_authors is not None and len(
                            title_authors) > 0:
                        for index in range(0, len(title_authors)):
                            referance_no = index + 1
                            referance_authors = ""
                            t_authors = title_authors[index].get("authors")
                            if t_authors is not None and len(
                                    t_authors) > 0:
                                for tau in t_authors:
                                    referance_authors = referance_authors + tau.get(
                                        "name") + ";"
                            referance_title = title_authors[index].get(
                                "title").replace("'", "\\'").replace(
                                    '"', '\\"')
                            referance_authors = referance_authors.replace(
                                "'", "\\'").replace('"', '\\"')
                            referance_source = BeautifulSoup(
                                sources[index],
                                "lxml").text.replace("'", "\\'").replace(
                                    '"', '\\"').replace('Source: ', '')
                            id = str(
                                uuid.uuid3(
                                    uuid.NAMESPACE_DNS, title +
                                    accession_number + referance_title))
                            sql = "REPLACE INTO ei_thesis_reference(id,title,accession_number,referance_no,referance_title,referance_authors,referance_source) " \
                                  "VALUES ('" + id + "','" + title + "','" + accession_number + "','" + str(referance_no) + "','" + referance_title + "','" + referance_authors + "','" + referance_source + "')"
                            sqls.append(sql)
                # refs page 2: only executed when there are more than 25 refs;
                # otherwise there is a single page and no next page.
                if number_of_references > 25:
                    time.sleep(random.uniform(1, 3000) / 1000)
                    refshref = "https://www.engineeringvillage.com/search/doc/refs.url?content=true&compendexajax=t&docid=" + docid + "&SEARCHID=" + searchid + "&database=1&DOCINDEX=&currPageNumber=2&searchtype=Expert&pageSize=25"
                    session = requests.session()
                    refs_response = session.get(refshref,
                                                headers=headers,
                                                cookies=cookies)
                    #print(refs_response.text)
                    refs_response.close()
                    refs_json = refs_response.json()
                    #print(refs_json)
                    referenceBean = refs_json.get("referenceBean")
                    title_authors = referenceBean.get("results")
                    sources = referenceBean.get(
                        "resultformat_abssourcelines")
                    if title_authors is not None and len(
                            title_authors) > 0:
                        # NOTE(review): referance_no restarts at 1 here, so
                        # page-2 references reuse numbers 1..25 — presumably
                        # unintended; confirm against the table's key scheme.
                        for index in range(0, len(title_authors)):
                            referance_no = index + 1
                            referance_authors = ""
                            t_authors = title_authors[index].get("authors")
                            if t_authors is not None and len(
                                    t_authors) > 0:
                                for tau in t_authors:
                                    referance_authors = referance_authors + tau.get(
                                        "name") + ";"
                            referance_title = title_authors[index].get(
                                "title").replace("'", "\\'").replace(
                                    '"', '\\"')
                            referance_authors = referance_authors.replace(
                                "'", "\\'").replace('"', '\\"')
                            referance_source = BeautifulSoup(
                                sources[index],
                                "lxml").text.replace("'", "\\'").replace(
                                    '"', '\\"').replace('Source: ', '')
                            id = str(
                                uuid.uuid3(
                                    uuid.NAMESPACE_DNS, title +
                                    accession_number + referance_title))
                            sql = "REPLACE INTO ei_thesis_reference(id,title,accession_number,referance_no,referance_title,referance_authors,referance_source) " \
                                  "VALUES ('" + id + "','" + title + "','" + accession_number + "','" + str(referance_no) + "','" + referance_title + "','" + referance_authors + "','" + referance_source + "')"
                            sqls.append(sql)
            #print(sqls)
            # Persist everything collected for this document in one batch.
            self.mysqlclient.insert_thesis_afoprt(sqls)
        else:
            # Detailed fetch failed: push the work item back for retry.
            self.redis_client.lpush(self.consumer_list_success_fail,
                                    json.dumps(it))
    else:
        # List fetch failed: push the work item back for retry.
        self.redis_client.lpush(self.consumer_list_success_fail,
                                json.dumps(it))
def create_index(self, index_folder, docs_path, add_terms=False):
    """Build a Lucene index over a document collection.

    Two corpus formats are supported, chosen by the extension of
    ``docs_path``: an ``.hdf5`` corpus (read via the project's
    ``corpus_hdf5`` module) or a directory tree of gzipped ClueWeb09
    WARC files.  When ``add_terms`` is true and the project parameters
    request it, an IDF dictionary is computed in a first pass and
    pickled to ``prm.idf_path`` before indexing.

    Parameters:
        index_folder: path of the (not yet existing) folder to create
            the Lucene index in.  ``os.mkdir`` raises if it exists.
        docs_path: ``.hdf5`` corpus file, or root folder of WARC files.
        add_terms: when True, also store term vectors / IDF info.

    NOTE(review): Python 2 code (print statements); relies on PyLucene
    (FieldType, IndexWriter, ...) and module-level ``prm``/``utils``.
    """
    print 'Loading Vocab...'
    # Lazily load the vocabulary once per instance.
    if not self.vocab:
        self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

    os.mkdir(index_folder)

    # Field types: t1 = stored, docs-only; t2 = unstored with term
    # frequencies; t3 = stored but not indexed at all.
    self.t1 = FieldType()
    self.t1.setStored(True)
    self.t1.setIndexOptions(IndexOptions.DOCS)

    self.t2 = FieldType()
    self.t2.setStored(False)
    self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    self.t3 = FieldType()
    self.t3.setStored(True)
    self.t3.setIndexOptions(IndexOptions.NONE)

    if add_terms:
        if prm.top_tfidf > 0 or prm.idf_path:
            # First pass over the whole corpus: document frequencies.
            print 'Creating IDF dictionary...'
            self.idf = defaultdict(int)
            doc_id = 0
            if docs_path.lower().endswith('.hdf5'):
                import corpus_hdf5
                corpus = corpus_hdf5.CorpusHDF5(docs_path)
                for txt in corpus.get_text_iter():
                    self.add_idf(txt)
                    if doc_id % 1000 == 0:
                        print 'Creating IDF, doc', doc_id
                    doc_id += 1
            else:
                # ClueWeb09
                import warc
                import gzip
                from bs4 import BeautifulSoup
                # list all files in the folder.
                paths = []
                for root, directories, filenames in os.walk(docs_path):
                    for filename in filenames:
                        paths.append(os.path.join(root, filename))
                for path in paths:
                    with gzip.open(path, mode='rb') as gzf:
                        for record in warc.WARCFile(fileobj=gzf):
                            # remove html tags; payload capped at ~1MB.
                            txt = BeautifulSoup(
                                record.payload[:1000 * 1000],
                                "lxml").get_text()
                            # remove WARC headers (first 10 lines).
                            txt = '\n'.join(txt.split('\n')[10:])
                            self.add_idf(txt)
                            if doc_id % 1000 == 0:
                                print 'Creating IDF, doc', doc_id
                            doc_id += 1
            # Convert raw document frequencies to IDF = log(N / df).
            for key, val in self.idf.items():
                self.idf[key] = math.log(float(doc_id) / val)
            pkl.dump(self.idf, open(prm.idf_path, 'wb'))

    fsDir = MMapDirectory(Paths.get(index_folder))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    self.writer = IndexWriter(fsDir, writerConfig)
    print "%d docs in index" % self.writer.numDocs()
    print "Indexing documents..."

    # Second pass: add every document to the index.
    doc_id = 0
    if docs_path.lower().endswith('.hdf5'):
        import corpus_hdf5
        corpus = corpus_hdf5.CorpusHDF5(docs_path)
        for txt in corpus.get_text_iter():
            title = corpus.get_article_title(doc_id)
            self.add_doc(doc_id, title, txt, add_terms)
            if doc_id % 1000 == 0:
                print 'indexing doc', doc_id
            doc_id += 1
    else:
        # ClueWeb09
        import warc
        import gzip
        from bs4 import BeautifulSoup
        # list all files in the folder.
        paths = []
        for root, directories, filenames in os.walk(docs_path):
            for filename in filenames:
                paths.append(os.path.join(root, filename))
        for path in paths:
            with gzip.open(path, mode='rb') as gzf:
                for record in warc.WARCFile(fileobj=gzf):
                    # Prefer the TREC id as document title when present.
                    if 'warc-trec-id' in record:
                        title = record['warc-trec-id']
                    else:
                        title = record['warc-record-id']
                    # remove html tags
                    #txt = BeautifulSoup(record.payload[:1000*1000], "lxml").get_text()
                    txt = record.payload[:1000 * 1000]
                    # remove WARC headers.
                    txt = '\n'.join(txt.split('\n')[10:])
                    self.add_doc(doc_id, title, txt, add_terms)
                    if doc_id % 1000 == 0:
                        print 'indexing doc', doc_id
                    doc_id += 1

    print "Index of %d docs..." % self.writer.numDocs()
    self.writer.close()
nltk.download('stopwords') from nltk.stem.porter import PorterStemmer from nltk.corpus import stopwords train_label = pd.read_csv("labeledTrainData.tsv", delimiter="\t", quoting=3) train_unlabel = pd.read_csv("unlabeledTrainData.tsv", delimiter="\t", quoting=3) test = pd.read_csv("testData.tsv", delimiter="\t", quoting=3) corpus = [] corpus2 = [] for i in range(0, 25000): review = re.sub('[^A-Za-z]', ' ', train_label['review'][i]) review = BeautifulSoup(review).get_text() review = review.lower() review = review.split() review3 = re.sub('[^A-Za-z]', ' ', test['review'][i]) review3 = BeautifulSoup(review3).get_text() review3 = review3.lower() review3 = review3.split() ps = PorterStemmer() review = [ ps.stem(i) for i in review if not i in set(stopwords.words('english')) ] review = ' '.join(review) corpus.append(review) review3 = [ ps.stem(i) for i in review3 if not i in set(stopwords.words('english')) ]
# Calculate new scroll height and compare with last scroll height new_height = driver.execute_script("return document.body.scrollHeight") if new_height == last_height: break last_height = new_height #%% for media_href in list(media_hrefs): driver.get(media_href) try: imgUrl = BeautifulSoup( driver.page_source , 'html.parser').select('.KL4Bh')[0].select('img')[0].get('srcset').split(',')[-1:][0][:-5] dateSrting = driver.find_element_by_tag_name('time').get_attribute('datetime')[:10] dateDirPath = '../data/' + dateSrting if not os.path.isdir(dateDirPath): os.makedirs(dateDirPath) print("Downloading images " + dateDirPath) urllib.request.urlretrieve( imgUrl , filename=os.path.join( dateDirPath , imgUrl.split('/')[-1:][0])) except: pass print("Finished") driver.close()
#!/usr/bin/python
# _*_ coding:utf-8 _*_
"""Fetch a web page and search its raw HTML for phone numbers.

Prompts the user for a URL, downloads the page (certificate checks
disabled), prints the visible text, then searches for phone-number
patterns (Brazilian/UK-style formats) in the raw HTML.
"""
import nltk
import urllib
from nltk import re
from bs4 import BeautifulSoup
from urllib import request
import ssl

# Abstract the information from the url
url = input("Enter Your Website:\n")
# NOTE(review): disabling certificate verification is insecure; it is
# kept here only so self-signed/misconfigured sites still load.
ssl._create_default_https_context = ssl._create_unverified_context
html = request.urlopen(url).read()
raw = BeautifulSoup(html).get_text()
# raw.split() already yields the whitespace-delimited tokens; the
# original identity list-comprehension was redundant.
text_nopunct = raw.split()
print(raw)
raw = raw.replace(' ', '')

# Get the telephone number.  Raw string so the backslash escapes reach
# the regex engine unchanged (the non-raw original triggered
# invalid-escape-sequence warnings on modern Python).
pattern = re.compile(
    r"((\+?55|0)\-?\s?[1-9]{2}\-?\s?[2-9]{1}\d{3,4}\-?\d{4}|(\0?\d{4,5})\s?\d{6}|\0\d{10}|(\+44|0044)\s?\d{10})"
)
result = pattern.findall(str(html))
# Bug fix: the original indexed result[0] unconditionally and crashed
# with IndexError when the page contained no phone number.
if result:
    print("Found a Match:")
    print(result[0])
else:
    print("No match found.")
lk['href']) schl_detl_soup = BeautifulSoup(schl_detl.text) contact = schl_detl_soup.find(class_='contact') school_nm = contact.h4.getText('|').split('|') t = contact.find_all('p')[0].text.split('\n') text = [] for te in t: text.extend(te.split('\xa0')) c = {} for t in text: if '' in t: text.remove(t) a = t.replace('\r', '') if a == '': continue c.update({a.split(':')[0]: a.split(':')[1]}) sir = contact.find_all('p')[1].getText(' ') sir = sir.split(':') e = sir[1].split(' ')[0] r = ''.join(sir[1].split(' ')[1:]) sir = {sir[0]: e, r: sir[2]} # print(sir.split('|')) c.update(sir) c.update({'學校(中文)': school_nm[0]}) c.update({'學校(英文)': school_nm[1]}) c.update({'地區': link.text}) print(c) break break break
def getInterfacesFromReport(method_name, dict_link, folder_path, crsr, caller):
    """Recursively build the invocation (interface) graph for a method.

    Starting from ``method_name``, this parses the SciTools Understand
    "simple invocation tree" HTML reports in ``folder_path``, finds the
    tree whose invoker matches the method (and its data-dictionary link,
    when known), records its callees as edges in the module-level
    ``interface_graph`` dict, and recurses into each callee that is not
    already stored in the DB (checked via ``methodInDB`` with ``crsr``).

    Parameters:
        method_name: fully qualified method (``Class::method``) to expand.
        dict_link: Understand data-dictionary anchor for the method, or
            '' for the very first (user-supplied) method.
        folder_path: folder holding the Understand HTML report pages.
        crsr: sqlite cursor used by ``methodInDB``.
        caller: the method that invoked this one (for logging / graph
            back-patching).

    Cycles are detected (method already a vertex in ``interface_graph``)
    and logged to the module-level ``cycles_log`` instead of recursing.
    Uses module globals: loading_count, interface_graph, cycles_log,
    start_func_dict_link, SKIP_DUPLICATES.
    """
    # Console progress bar: a 50-char bar that wraps around.
    if loading_count[0] > 50:
        loading_count[0] = 0
    sys.stdout.write('\r')
    sys.stdout.write("Generating Interface Graph [%-50s]" %
                     ('=' * loading_count[0]))
    sys.stdout.flush()  #loading bar print for the console
    loading_count[0] += 1

    if len(method_name) < 1:  # edge case calls empty name
        return
    test_if_in_graph = method_name + "#" + dict_link
    if test_if_in_graph in interface_graph:  # how to check for cycles and recursion
        print(
            '\nWARNING: Possible cycle found in invocation tree. %s was invoked by %s and is already a vertx in the interface graph.'
            % (method_name, caller),
            file=cycles_log)
        return

    # Understand splits the report alphabetically by first character.
    if not method_name[0].isalpha():
        frst = 'Non-Alpha'
    else:
        frst = method_name[0]  # decide which char to use for inv tree file
    try:
        invtrees_html = open(folder_path + '\simpleinvtree_' + frst + '.html',
                             encoding='utf8')
    except FileNotFoundError as e:  #if there is a failure to open the file
        return
    invtrees_html = BeautifulSoup(invtrees_html, 'html.parser')
    invtree_blocks = str(invtrees_html).split(
        "\n\n")  #break the whole page into individual trees
    inv_list_text = []  #will contain the current invocation tree

    # Below is to see if the method is a constructor. This important because in Understand docs the
    # way that it links the constructor to its place in the data dictionary changes
    # depending on whether the constructor is being called or is the caller (Frustrating!), making it impossible to
    # automatically match the constructor tree from where it is being called to its tree automatically,
    # which is what prompts the user input when duplicates are discovered.
    test_for_constructor = method_name.split('::')
    if len(test_for_constructor) < 2:
        isConstructor = False
    else:
        # Constructor when class name == method name (Class::Class).
        if test_for_constructor[0].strip() == test_for_constructor[1].strip():
            isConstructor = True
        else:
            isConstructor = False

    if dict_link == '' or isConstructor:
        inv_lists = [
        ]  #This list will store all of the invocation trees that the script finds that come from the same Class::Method

    #-----regex compilation-----#
    get_link_regex = re.compile(r'\<a href=\"dictionary_.+?\.html#(.+)\"\>')
    #---------------------------#

    for tree in invtree_blocks:  #search for the right tree
        original_block = tree
        tree = BeautifulSoup(tree, 'html.parser')
        tree = str(tree).replace(
            "|", "")  # take out the | symbol understand puts in the html
        tree = tree.split(
            '\n'
        )  # change the tree into a list where [0] == invoker and [1:] are the invocations
        tree = list(map(str.strip, tree))  # strip each of the strings in the list
        invoker_html = tree[0]  # still in HTML
        invoker_text = BeautifulSoup(invoker_html, 'html.parser')
        invoker_text = invoker_text.get_text()  # turn to text
        if invoker_text == method_name:  #found a potentially correct tree
            if dict_link == '' or isConstructor:
                inv_lists.append(original_block)  #collect potential lists
                continue
            else:  # normal behavior
                tmp_link = get_link_regex.search(invoker_html).group(
                    1)  # get the dictionary link
                if tmp_link == dict_link:  # we have the right name and dict_link
                    inv_list_text.append(invoker_text + '#' + tmp_link)
                    for index in range(
                            1, len(tree)
                    ):  #loop through the tree and turn the HTML to text and add to inv_list_text
                        tmp_link = get_link_regex.search(tree[index])
                        method_text = BeautifulSoup(tree[index], 'html.parser')
                        method_text = method_text.get_text()
                        if tmp_link == None and method_text == '':  # this if statement is to capture when the invocation tree is at the bottom of the page so the .split '\n\n' didn't work
                            #handle when it as at the end of the page
                            table_text = BeautifulSoup(tree[index + 1],
                                                       'html.parser')
                            table_text = table_text.get_text()
                            if table_text == 'Non-AlphaABCDEFGHIJKLMNOPQRSTUVWXYZ':  #search for table at the bottom of the page
                                break
                        inv_list_text.append(method_text + '#' +
                                             tmp_link.group(1))
                    break  # found the right tree so we can stop iterating over them

    #---------------Handling first time methods------------------#
    if dict_link == '':  # this block is for when the method is passed in at the start of the code, so no dict link
        if dict_link == '' and len(inv_lists) == 0:  #no invocations found
            print(
                "\n\nThis function doesn't have any interfaces! Try a different one."
            )  # user gave a function without interfaces
            exit()
        if dict_link == '' and len(
                inv_lists
        ) == 1:  # we found an invocation tree that matches the name, assume it is right
            inv_list_html = inv_lists[0]
            inv_list_html = str(inv_list_html).split('\n')
        elif dict_link == '' and len(
                inv_lists
        ) > 1:  #more than one, no html link to ref yet, have to get user input to decide
            # prompt user to tell me which one they want
            count = 0
            for tree in inv_lists:  #print out each of the method's inv trees and ask which is the right one
                count += 1
                tree_text = BeautifulSoup(tree, 'html.parser')
                tree_text = tree_text.get_text()
                tmp = str(count) + ": "
                print(tmp)
                print(tree_text, '\n')
                start_letter = tree_text[0]
            # open the inv tree understand html page so the user can have some help deciding
            webbrowser.open('file://' + os.path.realpath(folder_path +
                                                         '\simpleinvtree_' +
                                                         frst + '.html'))
            print(
                '\n\nMultiple invocation trees with same Class::Method found. Which one would you like to generate a chart with?'
            )
            print(
                'Type the number of the tree you would like to start with.\nI have opened the HTML report in your browser as well to help you decide.'
            )
            k = input()  #have the user select a number
            try:
                k = int(k)
            except:
                k = -1
            while (int(k) > count or int(k) < 1):  #make sure it is a good input
                print("Invalid input: try again please")
                k = input()
                try:
                    k = int(k)
                except:
                    k = -1
            inv_list_html = inv_lists[k - 1]  # we now know which one we want
            inv_list_html = str(inv_list_html).split(
                '\n')  # list of methods with link wrapping it in html
        if dict_link == '' and len(
                inv_list_html
        ) > 0:  # if inv_list_html len is > 0 then we found something, manipulate it for recursion
            inv_list_text = []
            count = 0
            for method in inv_list_html:
                method_link = get_link_regex.search(method).group(1)
                method_text = BeautifulSoup(method, 'html.parser')
                method_text = method_text.get_text()  #turn from HTML to text
                if count > 0:  #methods other than the invoker have these | that understand puts
                    method_text = method_text.replace('| ', '')
                if count == 0:  # since the first method that is passed in by the user doesn't have a dict_link we must save it
                    start_func_dict_link[0] = method_link
                method_string = method_text + "#" + method_link
                inv_list_text.append(
                    method_string)  #get the tree in list form for recursion
                count += 1
    #------------Handling when the constructor has a different dict link----------------#
    else:
        if isConstructor and len(
                inv_lists) == 0:  # there was no invocation tree found
            return
        if isConstructor and len(
                inv_lists
        ) == 1:  # only one found, this must be the correct one, assume it is correct
            # have the update dict link on the interface graph to this new one
            # use this as the invocation tree
            inv_list_html = inv_lists[0]
            inv_list_html = str(inv_list_html).split('\n')
        elif isConstructor and len(inv_lists) > 1:
            # when the constructor has many definitions, make the user select the one we want
            if SKIP_DUPLICATES:
                return  #this is for when the user selects they don't want to be prompted to select the right constructor
            count = 0
            for tree in inv_lists:  #print out the inv trees and open the web browser to help the user choose the right one
                count += 1
                tree_text = BeautifulSoup(tree, 'html.parser')
                tree_text = tree_text.get_text()
                tmp = str(count) + ": "
                print(tmp)
                print(tree_text, '\n')
                start_letter = tree_text[0]
            webbrowser.open('file://' + os.path.realpath(folder_path +
                                                         '\simpleinvtree_' +
                                                         frst + '.html'))
            print(
                '\n\nMultiple versions of the same constructor found. Which one is called by',
                caller, '?')
            print(
                'Type the number of the correct tree.\nI have opened the HTML report in your browser as well to help you decide.\nIf you don\'t wish to decide, type \'0\''
            )
            k = input()
            try:
                k = int(k)
            except:
                k = -1
            while (int(k) > count or int(k) < 0):
                print("Invalid input: try again please")
                k = input()
                try:
                    k = int(k)
                except:
                    k = -1
            if k == 0:  #if the user selects 0, it won't continue traversing this branch
                return
            inv_list_html = inv_lists[k - 1]  # we now know which one we want
            inv_list_html = str(inv_list_html).split(
                '\n')  # list of methods with link wrapping it in html
        # get html into neat list
        if isConstructor and len(
                inv_list_html
        ) > 0:  # if inv_list_html len is > 0 then we found something, manipulate it for recursion
            inv_list_text = []
            for i in range(
                    0, len(inv_list_html)
            ):  #iterate through the tree and get it in proper text format with dict_link
                method_link = get_link_regex.search(inv_list_html[i]).group(1)
                method_text = BeautifulSoup(inv_list_html[i], 'html.parser')
                method_text = method_text.get_text()
                if i > 0:
                    method_text = method_text.replace('| ', '')
                method_string = method_text + "#" + method_link
                if i == 0:
                    # have to update the graph last call with the new dict link, because of the issue with the dict link being different when it is the callee vs. the caller
                    old_string = method_name + '#' + dict_link  # string to be replaced
                    prev_tree = interface_graph[caller]
                    prev_tree[prev_tree.index(old_string)] = method_string
                    interface_graph[caller] = prev_tree
                inv_list_text.append(method_string)

    #-----------Ready to fix the correct invocation list up for insert into graph----#
    # when we get here we have inv_list_text with the list of invocations in order inv_list_text[0] == invoker and [1:] invocations
    if len(inv_list_text) >= 2:  # there are invoked methods
        if inv_list_text[
                0] not in interface_graph:  # double check to make sure there won't be a cycle created
            i = 1
            while i < len(inv_list_text):
                if '(Virtual)' in inv_list_text[
                        i]:  #if virtual is on there strip it off for the graph
                    inv_list_text[i] = inv_list_text[i].replace(
                        ' (Virtual)', '').strip()
                if inv_list_text[i] in interface_graph or inv_list_text[
                        i] == inv_list_text[
                            0]:  #deal with cycles caused by recursion or edges
                    # outputs to log file. This case will be triggered if a method exists in the current invocation tree and it has
                    # already been added to the graph as another vertex. This maintains integrity of acyclic quality for topological sort
                    print(
                        '\nWARNING: Possible cycle found in invocation tree. %s was invoked by %s, and is already a vertx in the interface graph.'
                        % (inv_list_text[i], inv_list_text[0]),
                        file=cycles_log)  # warning to the user
                    del inv_list_text[
                        i]  # delete the edge causing the cycle for the sake of the topological sort
                    i -= 1
                i += 1
            if len(
                    inv_list_text
            ) >= 2:  # after del edges that make cycles make sure there at least one edge coming from the vertex
                interface_graph[inv_list_text[0]] = inv_list_text[1:]
            else:
                return
        else:  # this only gets triggered if the dict_link of the invoker method changed between the start of this method's execution and here
            print(
                '\nWARNING: Possible cycle found in invocation tree. %s was invoked by %s and is already a vertx in the interface graph.'
                % (inv_list_text[0], caller),
                file=cycles_log)
            return

    for i in range(1, len(inv_list_text)
                   ):  #time to recurse over the branches left in the tree
        tmp = inv_list_text[i].split(
            '#')  # break the method from its dict link
        method_name = tmp[0]
        dict_link = tmp[1]
        # first check if the method is in the DB, if it is we assume that that the branch from that method has been fully traversed
        if not methodInDB(method_name, dict_link, crsr)[0]:
            getInterfacesFromReport(
                method_name, dict_link, folder_path, crsr,
                inv_list_text[0])  # calls getInt. on next method
text = BeautifulSoup(text, features="html.parser") # removes html tags text = text.get_text() # removes target=blank Markdown tags text = text.replace("{:target=\"_blank\"}", '') # removes Markdown links text = regex.sub(linkremover, '', text) # removes anything that isn't an alphabetical character and casts the remaining string to lowercase text = regex.sub(nonalphabeticalremover, ' ', text).lower() wordcount += len(text.split()) # nltk stemming/token magic from http://ryancompton.net/2014/06/06/statistical-features-of-infinite-jest/ tokens = nltk.word_tokenize(text) stemmer = nltk.stem.PorterStemmer() stemmed_tokens = map(lambda x: stemmer.stem(x), tokens) for token in stemmed_tokens: if token in uniquewords: newVal = uniquewords.get(token) + 1 uniquewords.update({token: newVal}) else: uniquewords.update({token: 1}) continue if wordcount < 1:
for srow in standinglist: srowPrintable = str(srow) srowPrintable = srowPrintable.replace('[', '') srowPrintable = srowPrintable.replace(']', '') srowPrintable = srowPrintable.replace('\'', '') srowCSV = srowPrintable.split(',') swr.writerow(srowCSV) with open('ol.csv', 'w', encoding="utf-8", newline='') as ol_file: olwr = csv.writer(ol_file) for olrow in oltable_rows: olrowPrintable = str(olrow) olrowPrintable = olrowPrintable.replace(',', '.') olrowPrintable = olrowPrintable.replace(' (R)', '') olrowPrintable = BeautifulSoup(olrowPrintable, "lxml").get_text(separator=',') olrowCSV = olrowPrintable.split(',') olwr.writerow(olrowCSV) with open('r.csv', 'w', encoding="utf-8", newline='') as r_file: rwr = csv.writer(r_file) for rrow in rtable_rows: rrowPrintable = str(rrow) rrowPrintable = rrowPrintable.replace(',', '.') rrowPrintable = rrowPrintable.replace(' (R)', '') rrowPrintable = BeautifulSoup(rrowPrintable, "lxml").get_text(separator=',') rrowCSV = rrowPrintable.split(',') rwr.writerow(rrowCSV) with open('p.csv', 'w', encoding="utf-8", newline='') as p_file: pwr = csv.writer(p_file)
# removes html tags text = text.get_text() # removes target=blank Markdown tags text = text.replace("{:target=\"_blank\"}", '') # removes Markdown links text = regex.sub(linkremover, '', text) # removes anything that isn't an alphabetical character and casts the remaining string to lowercase text = regex.sub(nonalphabeticalremover, ' ', text).lower() blob = TextBlob(text) sentiments.update({filename: blob.sentiment.polarity}) wordsperpost.update({filename: len(text.split())}) wordcount += len(text.split()) # nltk stemming/token magic from http://ryancompton.net/2014/06/06/statistical-features-of-infinite-jest/ tokens = nltk.word_tokenize(text) stemmer = nltk.stem.PorterStemmer() stemmed_tokens = map(lambda x: stemmer.stem(x), tokens) for token in stemmed_tokens: if token in stems: newVal = stems.get(token) + 1 stems.update({token: newVal}) else: stems.update({token: 1})
stems = stem_tokens(words, stemmer) return stems #Pre-processing step print("Pre-processing documents...") for file in file_name: if(counter < 201): start = timer() with open(os.path.join(folder_dir,rel_path,file), 'rb') as f: read_data = f.read() #Read from file input_str = BeautifulSoup(read_data, "lxml").get_text() # Extract text from document input_str = input_str.casefold() #Convert to lower-case input_str = re.sub(r'\d+', '', input_str) #Remove numbers input_str = input_str.translate(str.maketrans("","",string.punctuation)) #Remove punctuation input_str = " ".join(input_str.split()) #Removes whitespaces input_str = input_str.replace("\n"," ") #Removes newline input_str = unicodedata.normalize("NFKD", input_str) #Removes unicode characters. corpus[file] = input_str print(counter) counter+=1 f.close() else: break #print(list(corpus.values())[0]) --Print first document's text for testing values = [] files = [] for k,v in corpus.items(): values.append(v) files.append(k)
def beautify(self, element):
    """Strip markup from *element*, drop its first and last characters
    (the enclosing brackets), and return the comma-separated items.

    Returns a list of strings obtained by splitting on ', '.
    """
    text = BeautifulSoup(str(element), 'lxml').text
    inner = text[1:-1]  # remove surrounding bracket characters
    return inner.split(', ')
def collectGlobals(folder_path):
    """
    Method to get all of the global and public objects from the understand docs
    It uses the the object cross reference pages of the understand docs

    Walks the alphabetical ``object_xref_*.html`` pages (Non-Alpha, then
    A..Z) in *folder_path*, extracts every object marked "Global Object"
    or "Public Object" together with the methods and line numbers where
    it is Used/Set, and stores one row per (object, method) pair in a
    local SQLite database ``globals.db``.
    """
    print("Collecting global/public objects from this project...")
    glbl_connection = sql.connect("globals.db")  #creates a globals.db file
    glbl_crsr = glbl_connection.cursor()  #crsr to execute commands
    create_globals_table = """CREATE TABLE globals (
    global_id INTEGER PRIMARY KEY,
    var_name VARCHAR(255),
    method_used VARCHAR(200), /* the method signature of where it was used */
    use_loc VARCHAR(255) /* line where it was used */
    );"""
    glbl_crsr.execute(
        create_globals_table)  #inline sql to create table for objects

    ref_page = 'Non-Alpha'  # Understand names the first page 'Non-Alpha'
    count = 0  # running primary key for the globals table

    #---regex compiles----#
    # stops the code from recompiling the regex and adding it to the cache every loop
    global_regex = '.+Global Object\)'  #regex to search for the objs labeled public/global
    global_regex = re.compile(global_regex)
    public_regex = '.+Public Object\)'
    public_regex = re.compile(public_regex)
    static_remove = re.compile(' \(Static')
    use_search = re.compile(' Use ')
    set_search = re.compile(' Set ')
    #---------------------#

    while ref_page == 'Non-Alpha' or (ord(ref_page) <= ord(
            'Z')):  # loop to loop through each of the pages for the variables
        print("Collecting global variables that start with ", ref_page, "...")
        objxref_html = open(folder_path + '\object_xref_' + ref_page + '.html')
        objxref_html = BeautifulSoup(
            objxref_html,
            'html.parser')  # get the page's html in a parsable object
        obj_blocks = str(objxref_html).split(
            '\n\n')  # break the page into units for each object
        for block in obj_blocks:
            matched = False
            # The first line of a block carries the object's label.
            if global_regex.search(block.split('\n')[0]):
                obj = 'Global'
                matched = True
            elif public_regex.search(block.split('\n')[0]):
                obj = 'Public'
                matched = True
            if matched:  #if the block has been identified as public/global object
                block_txt = BeautifulSoup(block, 'html.parser').getText()
                block_txt = block_txt.split('\n')
                tmp = block_txt[0].split(
                    '%s Object) Declared as: '
                    % obj)  #split up the first line's information
                if len(tmp) > 1:  #there is a Declared as:
                    # "type name" form: declared type first, then the name.
                    var_name = tmp[1].strip() + " " + tmp[0].strip()
                else:  # no Declared as
                    var_name = tmp[0].split()[0].strip()
                if static_remove.search(
                        var_name):  #get rid of the static part if it's there
                    # drop the trailing ' (Static' suffix (8 chars).
                    var_name = var_name[:-8].strip()
                elif '(' in var_name:
                    # drop a trailing ' (' left over from the label.
                    var_name = var_name[:-2].strip()
                used_in_methods = {
                }  # now search the next lines of the block to find which methods the obj is used in
                for line in block_txt[1:]:
                    if (use_search.search(line)) or (set_search.search(line)):
                        line = line.split()
                        # last three tokens are: file, line-number, method.
                        use_loc = line[-3] + " " + line[-2]
                        method_used = line[-1].strip()
                        if method_used not in used_in_methods:  #for the case where there are multiple lines in one method
                            used_in_methods[method_used] = [
                                use_loc
                            ]  # where the obj is used method = key, lines used = val
                        else:
                            used_in_methods[method_used].append(use_loc)
                if len(used_in_methods) > 0:
                    for method in used_in_methods:  # if there were places where the obj was used insert into the db
                        glbl_crsr.execute(
                            'INSERT INTO globals (global_id, var_name, method_used, use_loc) VALUES (?, ?, ?, ?)',
                            (
                                count,
                                var_name,
                                method,
                                str(used_in_methods[method]),
                            ))
                        count += 1
        # Advance: Non-Alpha -> 'A' -> 'B' -> ... -> 'Z'.
        if ref_page == 'Non-Alpha':
            ref_page = 'A'
        else:
            ref_page = chr(ord(ref_page) + 1)
    # save all changes made
    glbl_connection.commit()
    # finished
    glbl_connection.close()
from bs4 import BeautifulSoup f = open("words.txt", "r") g = open("words.final.txt", "w") for line in f: k = BeautifulSoup(line).text k = " ".join([ x for x in k.split(" ") if len(x) > 5 and not (x.startswith("@") or x.startswith(".mas") or "http" in x or "png" in x or "jpg" in x) ]) if len(k) > 1: g.write("{}\n".format(k.strip())) f.close() g.close()
def update(update, context):
    """Refresh the feed: re-read every subscribed RSS feed from
    ``sites.json`` and send matching new posts to the chat.

    For each site, behavior depends on its stored state:
      - no keywords, no date: send every entry;
      - no keywords, date set: send entries newer than the stored date;
      - keywords, no date: send entries whose summary shares a stem with
        the keywords (Russian Snowball stemming);
      - keywords and date: same keyword filter, stopping at the stored date.
    After processing, the per-site 'date' is advanced to the newest
    entry's 'published' value and sites.json is rewritten.

    Errors: a missing sites.json or any other exception is reported back
    to the chat rather than raised (Telegram handler convention).
    """
    try:
        with open('sites.json', 'r') as file:
            site_list = json.load(file)
        new_site_list = []
        for site in site_list:
            # Entries arrive newest-first; reverse to oldest-first.
            raw_news = feedparser.parse(
                site['url'])['entries'][::-1]  # fetch raw feed entries
            date = site['date']
            keywords = site['keywords']
            new_date = raw_news[-1]['published']
            if keywords == [] and date == None:
                # First run, no filter: send everything.
                for new in raw_news:
                    time.sleep(0.01)
                    summary = BeautifulSoup(new['summary'],
                                            'html.parser').get_text()
                    summary = sentence(summary)
                    mes = new['title'] + "\n\n" + summary + "\n" + new[
                        'link']  # build the message to send
                    context.bot.send_message(chat_id=update.effective_chat.id,
                                             text=mes)
            elif keywords == [] and date != None:
                # No keyword filter: send until we reach the stored date.
                for new in raw_news:
                    time.sleep(0.01)
                    summary = BeautifulSoup(new['summary'],
                                            'html.parser').get_text()
                    summary = sentence(summary)
                    if new['published'] != date:
                        mes = new['title'] + "\n\n" + summary + "\n" + new[
                            'link']  # build the message to send
                        context.bot.send_message(
                            chat_id=update.effective_chat.id, text=mes)
                    else:
                        break
            elif keywords != [] and date == None:
                # Keyword filter, no date: stem keywords once, then send
                # every entry sharing at least one stem.
                time.sleep(0.01)
                stemmer = SnowballStemmer("russian")
                kwords = set([stemmer.stem(i) for i in keywords])
                for new in raw_news:
                    summary = BeautifulSoup(new['summary'],
                                            'html.parser').get_text()
                    summary = sentence(summary)
                    word_set = set(
                        [stemmer.stem(i) for i in summary.split(' ')])
                    if word_set & kwords != set():
                        mes = new['title'] + "\n\n" + summary + "\n" + new[
                            'link']  # build the message to send
                        context.bot.send_message(
                            chat_id=update.effective_chat.id, text=mes)
            elif keywords != [] and date != None:
                # Keyword filter plus date cutoff.
                time.sleep(0.01)
                stemmer = SnowballStemmer("russian")
                kwords = set([stemmer.stem(i) for i in keywords])
                for new in raw_news:
                    summary = BeautifulSoup(new['summary'],
                                            'html.parser').get_text()
                    summary = sentence(summary)
                    word_set = set(
                        [stemmer.stem(i) for i in summary.split(' ')])
                    if word_set & kwords != set() and new['published'] != date:
                        mes = new['title'] + "\n\n" + summary + "\n" + new[
                            'link']  # build the message to send
                        context.bot.send_message(
                            chat_id=update.effective_chat.id, text=mes)
                    else:
                        break
            # Persist the advanced date for this site.
            new_site_list.append({
                'name': site['name'],
                'url': site['url'],
                'date': new_date,
                'keywords': site['keywords']
            })
        with open('sites.json', 'w') as file:
            json.dump(new_site_list, file)
    except FileNotFoundError:
        context.bot.send_message(
            chat_id=update.effective_chat.id,
            text='Нет списка сайтов, задайте его командой /sub')
    except Exception as exxx:
        context.bot.send_message(chat_id=update.effective_chat.id,
                                 text=('Возникла следующая ошибка ' +
                                       str(exxx)))
rows = table.find_all('tr') #print(len(rows)) if (flag == 0): row_th = rows[0].find_all('th') str_cells = str(row_th) header = BeautifulSoup(str_cells, "lxml").get_text() flag = 1 for row in rows[1:]: row_td = row.find_all('td') str_cells1 = str(row_td) cleantext = BeautifulSoup(str_cells1, "lxml").get_text() # create a list from cleantext stop_event_rows = cleantext.split(", ") # strip "[" from first element of the list and "]" from the last element of the list x = stop_event_rows[0].split("[") stop_event_rows[0] = x[1] size = len(stop_event_rows) x = stop_event_rows[size - 1].split("]") stop_event_rows[size - 1] = x[0] for _ in range(len(stop_event_rows)): data = {} data["trip_id"] = trip data["vehicle_number"] = stop_event_rows[0] data["leave_time"] = stop_event_rows[1] data["train"] = stop_event_rows[2]
def _parse_energy_level_section(str, last_data=None):
    """Parse one HTML energy-level section into a dict, keyed by line
    position within the section.

    Line mapping (0-based, blank lines skipped):
      0 -> 'configuration', 1 -> 'term', 3 -> 'J' (fraction or float),
      4 -> 'level (eV)', 5 -> 'uncertainty (eV)',
      6 -> 'level splittings (eV)', 7 -> 'leading percentages' (or,
      when not numeric, 'reference').

    Parameters:
        str: raw HTML text of the section.  NOTE(review): shadows the
            ``str`` builtin inside this function.
        last_data: previously parsed section; its 'term' is reused when
            this section leaves configuration/term blank (NIST tables
            omit repeated values).

    Returns the parsed dict; missing 'configuration'/'term' default to ''.
    NOTE(review): an unparseable 'J' fraction calls ``exit()`` — this
    kills the whole process, not just this parse.
    """
    data = {}
    splitted_str = str.split('\n')
    for i, line in enumerate(splitted_str):
        # Strip any residual markup from the line.
        clean_str = BeautifulSoup(line.strip(), "lxml").text
        if sys.version_info[0] < 3:
            # Python 2: encode unicode to utf-8 bytes for consistency.
            clean_str = clean_str.encode("utf-8")
        if clean_str.strip() == '':
            continue
        if i == 0:
            # '\xa0' is the non-breaking space NIST pads cells with.
            data['configuration'] = clean_str.replace('\xa0', '')
        if i == 1:
            data['term'] = clean_str.replace('\xa0', '')
        if i == 3:
            # J value: may be marked uncertain ('?'), a list (take the
            # first), or a fraction like '3/2'.
            if '?' in clean_str.strip():
                clean_str = clean_str.replace('?', '')
            if ',' in clean_str:
                clean_str = clean_str.split(',')[0]
            if '/' in clean_str.strip():
                resplit = clean_str.strip().split('/')
                try:
                    data['J'] = float(resplit[0]) / float(resplit[1])
                except ValueError:
                    print("clean_str = ", clean_str)
                    exit()
            else:
                try:
                    data['J'] = float(clean_str.strip())
                except:
                    logger.error(
                        "Could not read: {0}".format(clean_str.strip()))
        if i == 4:
            # Energy level: strip spacing/bracket decorations, then
            # prefer an explicit decimal number found by regex.
            clean_str = clean_str.strip().replace(' ', '').replace(
                '(', '').replace(')', '').replace('[', '').replace(']', '')
            refind1 = re.findall(r"\d+\.\d+", clean_str.replace(' ', ''))
            if type(refind1) == float:
                data['level (eV)'] = refind1
            elif len(refind1) == 1:
                data['level (eV)'] = float(refind1[0])
            else:
                data['level (eV)'] = float(clean_str)
        try:
            if i == 5:
                data['uncertainty (eV)'] = float(clean_str.replace(' ', ''))
        except ValueError:
            logger.error(
                "Could not read: {0}".format(clean_str.replace(' ', '')))
        if i == 6:
            data['level splittings (eV)'] = float(clean_str.replace(' ', ''))
        try:
            if i == 7:
                data['leading percentages'] = float(clean_str)
        except ValueError:
            # leading percentage is not always there; treat the text as
            # a reference instead.
            if i == 7:
                data['reference'] = clean_str.replace('\xa0', '')
    if 'configuration' not in data:
        data['configuration'] = ''
    if 'term' not in data:
        data['term'] = ''
    if data['configuration'] == '':
        # data['configuration'] = last_data['configuration']
        # NOTE(review): term is inherited from last_data only when the
        # configuration is also blank — presumably a continuation row.
        if data['term'] == '':
            data['term'] = last_data['term']
    return data
def parsePage(url):
    """Download *url*, unwrap formatting tags inside the '#primary'
    element, drop span/div nodes, and return the text split on
    <strong> boundaries, with the first four entries blanked.
    """
    response = requests.get(url)
    tree = BeautifulSoup(response.text)
    primary = tree.find(id='primary')

    # Unwrap plain formatting tags, keeping their children in place.
    for unwanted in ['b', 'i', 'u', 'ul', 'li', 'p', 'em']:
        for node in primary.findAll(unwanted):
            node.replaceWithChildren()

    # Erase span nodes, then div nodes, entirely.
    for node in primary.findAll('span'):
        node.replaceWith('')
    for node in primary.findAll('div'):
        node.replaceWith('')

    # Treat <strong>...</strong> markers as '%' field separators.
    markup = str(primary)
    markup = markup.replace('<strong>', "%")
    markup = markup.replace('</strong>', "%")
    pieces = markup.split('%')

    # Blank the leading four fragments (page boilerplate).
    for idx in range(0, 4):
        pieces[idx] = ""
    return pieces