Example #1
def fullPPtoW( review, re_level, sw_drop, stem, join_res=True ):
	''' git description
+ __fullPPtoW__( review, re_level, sw_drop, stem, join_res=True ) :
    + _does_ : Performs full pre-processing of the "review" string, according to the ("re_level", "sw_drop", "stem") parameters
    + _returns_ : Treated "review" (as _list_ or _string_ depending on "join_res")
    + _called by_ : __fullPPtoS__, __run__, __submission.run__
    + _calls_ : __reTreatment__, __rmStopWords__, __pStem__, __bs4.BeautifulSoup__
    + _arguments_ :
        
| type | name | description |
| --- | --- | --- |
| _string_ | review | Review to be pre-processed |
| _int_ | re_level | Level of Regex treatment (0-3) |
| _boolean_ | sw_drop | Should drop stop words |
| _boolean_ | stem | Should apply Porter Stemming |
| _boolean_ | join_res | Should return result as string (else as list of words) |
	'''
	
	result = BeautifulSoup( review )
	result = result.get_text().lower()
	result = reTreatment( result, re_level )
	if sw_drop:
		result = rmStopword( result.split() )
		if stem:
			result = pStem( result )
	else:
		if stem:
			result = pStem( result.split() )
		else:
			result = result.split()
	if join_res:
		return ( " ".join(result) )
	else:
		return result
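A hedged, self-contained sketch of the same pipeline; the helpers reTreatment, rmStopword and pStem are not shown above, so this assumes they roughly wrap re.sub, NLTK stop words and Porter stemming.

# Minimal sketch, assuming bs4 and nltk (with the stopwords corpus downloaded) are available.
import re

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def simple_preprocess(review, sw_drop=True, stem=True, join_res=True):
    text = BeautifulSoup(review, "html.parser").get_text().lower()
    text = re.sub(r"[^a-z\s]", " ", text)   # crude stand-in for reTreatment
    words = text.split()
    if sw_drop:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    if stem:
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]
    return " ".join(words) if join_res else words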
Example #2
def lexical_data(html_file, encoding="utf-8"):
    SEP = '_ENTRY'
    html = open(html_file, encoding=encoding).read()
    html = re.sub(r'<p', SEP + '<p', html)
    text = BeautifulSoup(html).get_text()
    text = ' '.join(text.split())
    for entry in text.split(SEP):
        if entry.count(' ') > 2:
            yield entry.split(' ', 3)
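A hedged usage sketch: "dict.html" is a hypothetical input in which each dictionary entry starts with a <p> element, as the _ENTRY separator trick assumes (also requires `import re` and `from bs4 import BeautifulSoup`).

# Each yielded entry has exactly four fields, because the generator only yields
# entries containing at least three spaces and splits with maxsplit=3.
for headword, pos, gloss, rest in lexical_data("dict.html", encoding="utf-8"):
    print(headword, "|", pos, "|", gloss)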
Example #3
 def clean_sentence(self,sentence):
     if self.html_clean:
         sentence = BeautifulSoup(sentence).get_text()   #   removing html markup
     sentence = sentence.lower() #   everything to lowercase
     # sentence = ''.join(x for x in sentence if x.isalnum() or x==" ")
     for ch_rep in self.clean_list:
         sentence = re.sub(ch_rep[0],ch_rep[1],sentence)
     sentence = ' '.join(filter(lambda x:x not in self.stopwords_eng,sentence.split()))
     sentence = ' '.join(filter(lambda x:len(x) > 1,sentence.split()))
     sentence = sentence.strip(" ") # Remove possible extra spaces
     if self.split_words:
         sentence = sentence.split()
     return sentence
Example #4
def clean_data(dados):
    only_table_tags = SoupStrainer("td")  # Define the tag to be parsed
    soup = BeautifulSoup(dados, 'html.parser', parse_only=only_table_tags)
    soup = soup.get_text('|', strip=True)
    valor = [valor.replace(",", ".") for valor in soup.split('|')]
    date = datetime.strptime(valor[0], "%d/%m/%Y")
    valor[0] = date.strftime("%Y/%m/%d")

    # Work in progress ######
    # try:
    #     cursor.execute("""
    #         INSERT INTO dollar (dollar_dia, dollar_compra, dollar_venda)
    #         VALUES (%s, %s, %s)
    #         ON DUPLICATE KEY UPDATE dollar_dia=%s",(str(valor[0]), str(valor[1]), str(valor[2]), str(valor[0]))""")
    # except mariadb.Error as error:
    #     print("Error: {}".format(error))
    # mariadb_connection.commit()

# Optimize the queries!
    if cursor.execute("SELECT * FROM dollar WHERE dollar_dia = %s", (valor[0],)) < 1:
        try:
            cursor.execute("INSERT INTO dollar (dollar_dia, dollar_compra, dollar_venda) VALUES (%s, %s, %s)",(str(valor[0]), str(valor[1]), str(valor[2])))
        except mariadb.Error as error:
            print("Error: {}".format(error))
    else:
        try:
            cursor.execute("UPDATE dollar SET dollar_compra = %s, dollar_venda=%s WHERE dollar_dia = %s",(str(valor[1]), str(valor[2]), str(valor[0])))
        except mariadb.Error as error:
            print("Error: {}".format(error))
    mariadb_connection.commit()
def process_strings( string ):
    # 1. Remove HTML
    words = BeautifulSoup(string).get_text()
    
       
    # 2. Separate joined camelCase words
    words = re.sub('(\w+)([A-Z][a-z]+)',lambda m:  " " + m.group(1) +\
               " " + m.group(2),  words  )
    
    # 3. Convert to lower case
    words = words.lower() 
    
    # remove unwanted characters
    ddd = re.sub('[^a-zA-Z0-9\s]', " ", words )
    ddd2 = re.sub( "(\d+)x(\d+)", lambda m: m.group(1) + " " + m.group(2)  , ddd )
    ddd3 = re.sub( "(\d+)x\s", lambda m: m.group(1) + " ", ddd2 )
    ddd4 = re.sub( "\sx(\d+)", lambda m:  " " + m.group(1), ddd3 )
    ddd5 = re.sub( "\sx\s",  " " , ddd4 )
    fff = re.sub( "(\D+)(\d+)", lambda m:  m.group(1) + " " + m.group(2), ddd5 ) 
    fff2 = re.sub( "(\d+)(\D+)", lambda m:  m.group(1) + " " + m.group(2), fff )
    words = re.sub( "(\d+)(\D+)(\d+)", lambda m:  m.group(1) + " " + m.group(2) + " " \
                  + m.group(3), fff2)
    # 5. Strip unit abbreviations; the pass is repeated because each substitution
    #    consumes the surrounding spaces, so adjacent units need another pass.
    for i in range(1, 10):
        words = re.sub(r'\s(ft|sq|in|gal|cu|h|oz|dia|yd|yds|a|p|qt|ah|amp|gpm|mp'
                       r'|quart|watt|cc|d|inc|incl|lb|lbs|lin|ln|mil|mm|no|n|oc'
                       r'|od|pc|pal|pt|s|sch|cs|case|pallet|w)\s', " ", words)

    # Return the result as a list of words
    return words.split()
Example #6
def clean_str(review_docs, method=2):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    output_docs = []
    if(method == 1):
        for string in review_docs:
            string = BeautifulSoup(string, "lxml").get_text()
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            string = string.strip().lower()
            string = string.split(" ")
            output_docs.append(string)
    elif(method==2):
        for string in review_docs:
            words = gensim.utils.to_unicode(string).split()
            output_docs.append(words)
    return output_docs
Example #7
def reviewToWordList(rawReview, removeStopWords = False):
    """
    Converts a document to sequence of words
    optionally removing stop words
    will later extend to optionally remove numbers
    
    I/O
    -Input: raw html in string form
    -Output: list of words
    """
    
    #Remove HTML
    cleanedReview = BeautifulSoup(rawReview).get_text()
    
    #Remove non-letters
    cleanedReview = re.sub("[^a-zA-Z]",
                           " ",
                           cleanedReview)
    
    #Convert words to lowerCase
    cleanedReview = cleanedReview.lower()
    
    #Split Words
    wordList = cleanedReview.split()
    
    #Optionally remove stop words
    if ( removeStopWords ):
        stops = set(stopwords.words('english'))
        wordList = [ word for word in wordList if word not in stops]
    
    #Return list of words
    return(wordList)
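A hedged usage sketch (assumes `from bs4 import BeautifulSoup`, `import re` and `from nltk.corpus import stopwords`, with the NLTK stopwords corpus downloaded).

# Stop-word removal drops the filler words from the token list.
print(reviewToWordList("<p>This movie was not that bad!</p>", removeStopWords=True))
# e.g. ['movie', 'bad'] once stop words are dropped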
Example #8
def reviewToWords(rawReview):
    """
    Converts raw review to a string of words
    -Input is single html string
    -Output is preprocessed single string
    """
    cleanedReview = None
    
    #Remove HTML
    cleanedReview = BeautifulSoup(rawReview)
    
    #Remove numbers and punctuation
    cleanedReview = re.sub("[^a-zA-Z]",
                           " ",
                           cleanedReview.get_text())
    
    #Make all words lowercase
    cleanedReview = cleanedReview.lower()
    
    #Split into individual words
    cleanedReviewWords = cleanedReview.split()
    
    #Convert to set instead of list for efficiency
    stops = set(stopwords.words("english"))
    
    #Remove stop words
    meaningfulWords = [word for word in cleanedReviewWords if word not in stops]
    
    #Join words back into one string
    return (" ".join( meaningfulWords ))
Example #9
 def get_data_for_vine_id(self, vine_id, timeout=30):
     try:
         page = requests.get("https://vine.co/v/{}".format(vine_id), timeout=timeout)
     except requests.exceptions.RequestException as e:
         error_message = "Problem with comminicating with vine page - {}".format(e)
         raise PresserRequestError(error_message)
     if page.ok:
         content = BeautifulSoup(page.content)
         all_script_tags = content.find_all("script")
         potential_script_tags = [script for script in all_script_tags if not script.has_attr("src")]
         script_lines = []
         for tag in potential_script_tags:
             for content in tag.contents:
                 for line in content.split(";\n"):
                     if line.count("window.POST_DATA"):
                         script_lines.append(line.replace("window.POST_DATA = ", ""))
         if len(script_lines) > 1:
             raise PresserJavaScriptParseError("More POST_DATA extracted than expected")
         if not script_lines:
             raise PresserJavaScriptParseError("No POST_DATA extracted for id {}".format(vine_id))
         script_line = script_lines[0].replace("POST = ", "")
         try:
             data = execjs.eval(script_line)
             vine = data[vine_id]
             return vine
         except execjs.RuntimeError as e:
             error_message = "Problem with parsing, check parsing logic. {}".format(e)
             raise PresserJavaScriptParseError(error_message)
     elif page.status_code == 404:
         raise Presser404Error("{} could not be found".format(page.url))
     else:
         raise PresserURLError("{} could not be accessed {} - {}".format(page.url, page.status_code,page.content))
def extract_filer_info_from_fulltext(filings):
    for idx,filing in enumerate(filings):
        soup = BeautifulSoup(filing['fulltext'],'lxml')
        soup = soup.find('sec-header')
        soup = soup.get_text()
        header = {'ITEM INFORMATION':[]}
        sub_head = {'flag':0}
        for line in soup.split('\n'):
            if ':' not in line:
                sub_head = {'flag':0}
                continue
            k = line.split(':')[0].strip()
            v = line.split(':')[1:]
            v = u''.join(list(v)).strip()
            if v == '':
                sub_head['flag'] = 1
                sub_head['value'] = k
                header[k] = {}
            elif sub_head['flag']:
                header[sub_head['value']][k] = v
            elif k == 'ITEM INFORMATION':
                header[k].append(v)
            else:
                header[k] = v
        filings[idx] = dict(filing.items() + header.items())
    return filings
Example #11
def get_file_by_url(url):
    """
    Get a file data located at a particular URL.

    Parameters
    ----------
    url : str
        The URL at which to access the data.

    Returns
    -------
    url_data : str or None
        The data retrieved at that URL from the file. Returns None if the
        attempted retrieval is unsuccessful.

    Note
    ----
    - BeautifulSoup is used in this case to avoid having to search in which
        format we have to encode or decode data before parsing it to UTF-8.
    """

    try:
        f = urlopen(url)
        soup = BeautifulSoup(f.read(), 'lxml').get_text()
        return '\n'.join(list(map(domain_to_idna, soup.split('\n'))))
    except Exception:
        print("Problem getting file: ", url)
Example #12
def nortonrate(u,logs=True,returning=False,timeout=15,proxy=None):
 '''
   This function takes any given link and returns a security report from safeweb.norton.com: whether it is a spam domain, contains malware, etc.
   Its main arguments are:
   u: the link to check
   logs: (default: True) show the process and the report; turn it off by setting it to False
   returning: (default: False) return the report as a string if set to True
   usage:
   >>>import bane
   >>>url='http://www.example.com'
   >>>bane.nortonrate(url)
'''
 if proxy:
  proxy={'http':'http://'+proxy}
 s=""
 try:
  if logs==True:
   print'[*]Testing link with safeweb.norton.com'
  ur=urllib.quote(u, safe='')
  ul='https://safeweb.norton.com/report/show?url='+ur
  c=requests.get(ul, headers = {'User-Agent': random.choice(ua)},proxies=proxy,timeout=timeout).text 
  soup = BeautifulSoup(c, "html.parser").text
  s=soup.split("Summary")[1].split('=')[0]
  s=s.split("The Norton rating")[0].split('=')[0]
  if logs==True:
   print'[+]Report:\n',s.strip()
 except:
  pass
 if returning==True:
  return s.strip()
Example #13
    def review_to_words(raw_review, remove_stopwords = False):
        # BeautifulSoup pulls data out of html file
        # here it removes html tags and markups
        text = BeautifulSoup(raw_review).get_text()

        # replace numbers by word number
        text=re.sub(r'[0-9]+','number',text)

        # remove punctuations (they can be analyzed for better results)
        text = re.sub(r'[^a-zA-Z]', ' ', text)
        text = text.lower()

        #make a list of words
        words_list = text.split()

        #download nltk text data sets, including stop words
        #nltk.download()

        if remove_stopwords:
            # get stopwords, searching a set is faster than searching a list
            stops = set(stopwords.words('english'))
            # remove stopwords
            words_list = [word for word in words_list if not word in stops]

        # reduce words to their stems
        stemmer=PorterStemmer()
        words_list=[stemmer.stem(word) for word in words_list]
        # return the list of words
        return words_list
Example #14
def gsearch( pages, Query ):
    Query = str( Query )
    Query = str( sub(' ', '+', Query ) )
    base = 'https://www.youtube.com'
    url = str( base + '/results?search_query=' + Query )
    print( url )
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_handle_robots(False)
    br.set_handle_equiv(False)
    br.addheaders = [('User-agent', UAS)]
    data = br.open( url )
    soup = BeautifulSoup( data.read() )
    gparse( soup )
    if pages > 1:
     i = 2
     soup = str( soup )
     for a in soup.split('href='):
        a = str( a )
        strn = str( 'page=' + str( i ) )
        if strn in a:
            b = str( a.split('"')[1] )
            b = str( sub('&amp;','&',b) )
            url = str( base + b )
            Next( url, i, pages )
            break
Example #15
def gsearch( pages, Query ):
    url = 'https://encrypted.google.com'
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.set_handle_equiv(False)
    br.addheaders = [('User-agent', UAS)]
    br.open( url )
    br.select_form(name='f')
    br.form['q'] = Query
    data = br.submit()
    soup = BeautifulSoup(data.read())
    gparse( soup )
    if pages > 1:
     soup = str( soup )
     for a in soup.split('"'):
       aye = str( 'start=10' )
       if aye in str(a):
          url = str( url + str(a) )
          Next( url, 1, pages )

    nrslt = []
    global grslt
    lg = len( grslt )
    for e in range(1, lg):
       g = str( grslt[e] )
       e = str( e )
       z = str( e + ' | google | ' + g )
       nrslt.append(z)
    return nrslt
Example #16
    def _old_parse_article_html(self, objectId, title, industry_press=None):
        df = Google().search("{0} site:marketwired.com".format(title))
        html = Google().cache(df.link.tolist()[0])
        article = BeautifulSoup(html).find("div",{"class":"mw_release"})
        article = article.text if article else None
        #company_name = BeautifulSoup(html).find("span",{"itemprop":"name"})
        company_name = BeautifulSoup(html).find("strong")
        company_name = company_name.split("SOURCE:")[-1] if company_name else None
        #q.enqueue(ClearSpark()._bulk_company_info, company_name)
        links, website = [], None
        for a in BeautifulSoup(html).find_all("a"):
            if "href" not in a.attrs: continue
            href = a["href"].lower()
            if "http" not in href: continue
            elif "marketwire" in href: continue
            elif "javascript" in href: continue
            elif "linkedin" in href: continue
            elif "twitter" in href: continue
            elif "youtube" in href: continue
            elif "flickr" in href: continue
            elif "facebook" in href: continue
            elif "google" in href: continue
            elif "addthis" in href: continue
            elif "sysomos" in href: continue

            if "target" in a.attrs:
                website = a["href"]
            links.append(href.strip())

        info = {"article": article, "company_name": company_name, 
                "website":website, "links":links}
        return info
Example #17
def bsearch( pages, Query ):
  baselink = 'https://bing.com'
  br = mechanize.Browser(factory=mechanize.RobustFactory())
  br.set_handle_robots(False)
  br.addheaders = [('User-agent', UAS)]
  r = br.open(baselink)
  html = r.read()
  br.select_form(nr=0)
  br.form['q'] = Query
  br.submit()
  soup = BeautifulSoup(br.response().read())
  for item in (soup.select("h2")):
     plink(item)       
  for item in(soup.select("a")):
     plink(item)
  if pages > 1:
     soup = str( soup )
     for b in soup.split('"'):
       b = str( b )
       if 'PORE' in b:
           url = str( baselink + str(b) )
           url = str( unquote(url ) )#.decode('utf-8') )
           Next( url, 1, pages )

  nrslt = []
  global brslt
  lg = len( brslt )
  for e in range(1, lg):
      g = str( brslt[e] )
      e = str( e )
      z = str( e + ' | bing | ' + g ).decode('utf-8')
      nrslt.append(z)
  return nrslt
Example #18
def cleanReview(path):
    data = []
    for f in os.listdir(path):
        filePath = os.path.join(path, f)
        with open(filePath, 'r', encoding ='utf-8') as theFile:
            data.append(theFile.read().lower())
    
    for i in range(len(data)):      
        review = data[i]
           
        # Remove HTML tags
        review = BeautifulSoup(review, "html.parser").get_text()

        # Remove non alphanumerics
        review = re.sub('[^a-zA-Z0-9]', ' ', review)

        # Tokenize
        tokens = review.split()
        
        # Remove stop words
        #stops = set(stopwords.words("english"))
        #tokens = [w for w in tokens if not w in stops]
        
        # Remove empty strings
        tokens = filter(None, tokens)
        
        review = ( " ".join(tokens))

        data[i] = review
    return data
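A hedged usage sketch; the directory path is hypothetical and should contain one plain-text/HTML review per file.

reviews = cleanReview("data/reviews/train/pos")  # hypothetical directory
if reviews:
    print(len(reviews), reviews[0][:80])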
Example #19
def ysearch( pages, Query ):
  url = 'https://search.yahoo.com'
  br = mechanize.Browser(factory=mechanize.RobustFactory())
  br.set_handle_robots(False)
  br.addheaders = [('User-agent', UAS)]
  r = br.open( url )
  html = r.read()
  br.select_form(nr=0)
  br.form['p'] = Query
  br.submit()
  soup = BeautifulSoup(br.response().read())
  gparse( soup )
  if pages > 1:
     soup = str( soup )
     for e in soup.split('<a'):
       e = str( e )
       if 'class="next"' in str(e):
          url = str( sub(' class="next" href=', '', e ) )
          url = str( url.split('"')[1] )
          url = str( unquote(url) ).decode()
#          url = str( sub( '&amp;', '&', url ) )
          Next( url, 1, pages )

  nrslt = []
  global yrslt
  i = 1
  lg = len( yrslt )
  for e in range(1, lg):
     g = str( yrslt[e] )
     e = str( e )
     z = str( e + ' | yahoo | ' + g )
     nrslt.append(z)
  return nrslt
Example #20
def clean_htm_file(original_dir,cleaned_dir,file):

    original_content = open(os.path.join(original_dir,file))
    original_lines = original_content.readlines()
    original_len = len(original_lines)
    original_size = os.path.getsize(os.path.join(original_dir,file))

    # Make Line
    if float(original_len)/float(original_size) <= 0.0008:
        original_content = open(os.path.join(original_dir,file))
        unformatted_content = BeautifulSoup(original_content,'lxml')
        formatted_content = unformatted_content.prettify()
        untagged_content = BeautifulSoup(formatted_content, 'html.parser').get_text().encode('utf8')
        original_content.close()

        # formatted_file = open(os.path.join(os.path.join(cleaned_dir, 'Manual_Clean'),file.split('.')[0] + '.txt'),'w')
        formatted_file = open(os.path.join(cleaned_dir,file.split('.')[0] + '.txt'),'w')
        untagged_content = untagged_content.split('\n')

        for uc in untagged_content:

            # Strip stray mis-encoded bytes left over from the source encoding
            for junk in ('\xc3', '\xbd', '\xc2', '\xa0', '\xe2', '\x80', '\x99', '\x9c', '\x9d',
                         '\x94', '\x97', '\xa6', '\xa2', '\x96', '\xb7', '\x92', '\x93', '\x8f', '\x95'):
                uc = uc.replace(junk, '')
            uc = uc.replace("\'", r"'")

            if uc != '' and not uc.isspace():
                formatted_file.write(uc.lstrip() + '\n')

        formatted_file.close()

    # Use the BeautifulSoup to clean the tags
    else:
        original_content = open(os.path.join(original_dir, file))
        untagged_content = BeautifulSoup(original_content,'html.parser').get_text().encode('utf8')
        original_content.close()

        untagged_file = open(os.path.join(cleaned_dir,file.split('.')[0] + '.txt'),'w')
        untagged_content = untagged_content.split('\n')

        for uc in untagged_content:

            # Strip stray mis-encoded bytes left over from the source encoding
            for junk in ('\xc3', '\xbd', '\xc2', '\xa0', '\xe2', '\x80', '\x99', '\x9c', '\x9d',
                         '\x94', '\x97', '\xa6', '\xa2', '\x96', '\xb7', '\x92', '\x93', '\x8f', '\x95'):
                uc = uc.replace(junk, '')
            uc = uc.replace("\'", r"'")

            if uc != '' and not uc.isspace():
                untagged_file.write(uc + '\n')

        untagged_file.close()
Example #21
def getartist():
    try:
        #new request
        response = requests.get(stream_url, headers={'Icy-MetaData': '1'}, stream=True)
        response.raise_for_status()
        headers, stream = response.headers, response.raw
        meta_int = headers.get('icy-metaint')
        meta_byte = stream.read(1)
        if (meta_byte):
            meta_length = ord(meta_byte) * 16
            meta_data = stream.read()
            mymetadata = BeautifulSoup(meta_data, 'html.parser')
            mymetadata = str(mymetadata.get_text())
            mymetadata = mymetadata.split("playing:",1)[1]
            mymetadata = (mymetadata.split("Support",1)[0]).rstrip()
            return mymetadata
    except KeyboardInterrupt:
        return "Failed"
def categories_files():
    os.makedirs('./Categories_new/', exist_ok=True)
    for filename in os.listdir('./Categories'):
        with open('./Categories/' + filename) as text:
            ok_text = BeautifulSoup(text.read(), "lxml").get_text()
            sentences = ok_text.split('.')
            with open('./Categories_new/{}'.format(filename), 'w', encoding='utf-8') as new:
                for sent in sentences:
                    new.write(sent+'.\n')
def bawe_file():
    with open('new_corpus.txt', 'w', encoding='utf-8') as new:

        for filename in os.listdir('../CORPUS_TXT'):
            with open('../CORPUS_TXT/' + filename) as text:
                ok_text = BeautifulSoup(text.read(), "lxml").get_text()
                sentences = ok_text.split('.')
                for sent in sentences:
                    new.write(sent+'.\n')
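Both helpers above split sentences on '.', which breaks on abbreviations and decimal numbers; a hedged alternative using NLTK's sentence tokenizer (requires nltk.download('punkt') and a hypothetical input file) could look like this:

from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize

# "sample.html" is a hypothetical file; the point is sent_tokenize vs. split('.').
text = BeautifulSoup(open("sample.html", encoding="utf-8").read(), "lxml").get_text()
for sent in sent_tokenize(text):
    print(sent)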
Example #24
def get_latest_articles():
    response = requests.get(ARXIV_URL)
    tree = ElementTree.fromstring(response.content)
    articles = []
    for article_xml in tree.findall('{http://purl.org/rss/1.0/}item'):

        article = Article()

        # Parse title
        article.title, info = article_xml.find('{http://purl.org/rss/1.0/}title').text.strip()[:-1].rsplit('(',1)

        # We don't want updates
        if "UPDATED" in info:
            continue

        # Parse out identifier and categories
        article.identifier, article.subject = info.split()[0:2]

        # Clean up identifier
        article.identifier = article.identifier.replace('arXiv:', '').split('v')[0]

        # Parse authors
        authors = BeautifulSoup(article_xml.find('{http://purl.org/dc/elements/1.1/}creator').text, "html.parser").getText()

        # Clean up authors, by removing affiliations in potentially nested
        # parentheses.
        while True:
            start = None
            end = None
            paren = 0
            for i in range(len(authors)):
                if authors[i] == '(':
                    paren += 1
                    if paren == 1:
                        start = i
                if authors[i] == ')':
                    paren -= 1
                    if paren == 0:
                        end = i
                        break
            if start is None:
                break
            else:
                authors = authors[:start].strip() + " " + authors[end+1:].strip()

        article.authors = authors.split(', ')

        # Parse main text
        article.text = BeautifulSoup(article_xml.find('{http://purl.org/rss/1.0/}description').text, "html.parser").getText().strip()

        # Remove dollar signs
        article.text = article.text.replace('$', '')

        articles.append(article)

    return articles
Example #25
def yql_real(tick, attempts):
	p = []
	p.append(tick)
	p.append(attempts)
	# Web scraping
	try:
		req = Request(
			'http://finance.yahoo.com/d/quotes.csv?s=' + tick + '&f=b2b3c6ej3m2r2j1',
			data=None,
			headers={
				   'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
			}
		)
		html = urlopen(req)
		data = html.read()

	        # Parsing
		soup = BeautifulSoup(data, 'html.parser')
	except URLError as e:
		writelog('[CRITICAL] URL ERROR Encountered' + str(e), 'yql_real', p)
		return 1
	except HTTPError as e:
		writelog('[WARNING] HTTP ERROR Encountered ' + str(e), 'yql_real', p)
		return 1
	except http.client.IncompleteRead as e:
		writelog('[WARNING] HTTP INCOMPLETE ERROR', 'yql_real', p)
		if (attempts < 3):
			r = yql_growth(tick, attempts + 1)
		else:
			writelog('[CRITICAL] HTTP INCOMPLETE ERROR AFTER 3 TRIES', 'yql_real', p)
			return 1

		if (r == 0):
			return 0
		else:
			writelog('[CRITICAL] HTTP INCOMPLETE READ ERROR - Unable to resolve', 'yql_real', p)
			return 1
	
	# Remove subscripts
	for tag in soup.find_all('sup'):
		tag.replaceWith('')
	
	soup = str(soup)
	ts = soup.split(',')
	
	# Delete Row
	dquery = 'DELETE FROM yql_real WHERE tick = \'' + tick + '\''
	dbquery(dquery)

	# Insert Row
	iquery = 'INSERT INTO yql_real (tick, ask, bid, rchange, es, marketcap, dayr, pe, smc) VALUES (\'' + tick + '\','
	for ele in ts:
		iquery = iquery + '\'' + ele + '\', '
	iquery = iquery[:-2] + ')'
	dbquery(iquery)
	return 0
Example #26
def preproc(review, use_stopwords=False):
    review_text = BeautifulSoup(review, "lxml").get_text()
    review_text = re.sub("[^a-zA-Z]"," ", review_text)

    if use_stopwords:
        stops = set(nltk.corpus.stopwords.words("english"))
        words = [w for w in review_text.split() if not w in stops]
        return " ".join(words)

    return review_text.lower()
def cleanUpText(review):
    # this is by no means exhaustive
    punctuation = """.,?!:;(){}[]"""
    #remove html tags
    review_text = BeautifulSoup(review).get_text()
    #replace '\n' with ''
    review_text = review_text.replace('\n', '')
    # treat punctuation as an individual word
    for c in punctuation:
        review_text = review_text.replace(c," %s "%c)

    return review_text.split()
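A quick check of the punctuation handling above (plain text, so BeautifulSoup leaves the input unchanged):

print(cleanUpText("Great movie, really!"))
# -> ['Great', 'movie', ',', 'really', '!']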
Example #28
def url_title(message, user, target, text):
    match = url_regex.match(text)
    if match:
        if not chanconfig.get_config_key(target, "url"):
            return
        url = match.group(0)
        title = BeautifulSoup(requests.get(url).content).title.string
        title = " ".join(title.split())
        if len(title) > 200:
            title = title[:200] + "..."

        message.client.say(target, "{}: {}".format(user.nick, title))
Example #29
 def set(self, **kwargs):
     if not kwargs.get("set", False):
         kwargs = self.preprocess(**kwargs)
     self.subtitle = kwargs["subtitle"]
     self.content = kwargs["content"]
     self.custom_div = kwargs["custom_div"]
     self.word_count = 0
     if self.content:
         words = BeautifulSoup(self.content, "html.parser").get_text()
         self.word_count = len(words.split())
     super(Text, self).set(**kwargs)
     self.postprocess(**kwargs)
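The word count above reduces to a one-line pattern; a minimal sketch:

from bs4 import BeautifulSoup

html = "<p>Hello <b>brave</b> world</p>"
print(len(BeautifulSoup(html, "html.parser").get_text().split()))  # 3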
Example #30
date = ['201810170CGY', '201810270BOS']  # each game id is a 14-character string

game_list = []

for dates in date:
    url = f"https://www.hockey-reference.com/boxscores/{dates}.html#all_scoring"
    games_html = urlopen(url)
    #games_html = open(f"C:\\Users\\dbge\\OneDrive - Chevron\\Random\\{dates}.html")
    games_soup = BeautifulSoup(games_html, 'lxml')

    table = games_soup.find('table')
    rows = table.findAll('tr')
    str_cells = str(rows)
    cleantext = BeautifulSoup(str_cells, 'lxml').get_text()

    s = cleantext.split(',')

    for i in range(1, len(s)):
        s[i] = s[i].replace("\t", "")

    df = pd.DataFrame(s)

    df[0] = df[0].str.split('\n')
    tags = df[0].apply(pd.Series)
    tags = tags.rename(columns=lambda x: 'tag_' + str(x))
    df = pd.concat([df[:], tags[:]], axis=1)
    df_width = len(df.columns)

    cleanedDF = PPandRegularGoals(df, dates)
    game_list.append(cleanedDF)
Example #31
                # VALUES FROM THE DETAIL PAGE
                if 1:
                    r = requests.get(base_url + link_dettaglio, timeout=5)
                    responce = r.content
                    soup_dettaglio = BeautifulSoup(responce)

                    box = soup_dettaglio.find_all(
                        "div", attrs={"class": "padding_box"})
                    righe = str(box[0]).split('<br/>')
                    lista_scorporabile = []
                    for riga in righe:
                        soup_contenuto = BeautifulSoup(riga).get_text().strip()

                        provenienza = 'SCP'
                        if 'CIG:' in soup_contenuto:
                            cig = soup_contenuto.split(':')[1].strip()
                            identificativo_gara = cig + '_' + provenienza
                        if 'Descrizione' in soup_contenuto:
                            oggetto = soup_contenuto.split(
                                ':')[1].strip().replace('"', "'")
                            oggetto = pulisci(oggetto)

                        if 'Procedura' in soup_contenuto:
                            procedura = soup_contenuto.split(
                                ':')[1].strip().replace('"', "'")

                        if 'Denominazione' in soup_contenuto:
                            ente = soup_contenuto.split(
                                ':')[1].strip().replace('"', "'")
                            ente = pulisci(ente)
Example #32
def review_to_wordlist(review, remove_stopwords=False):
    clean_sentence = []
    review = review.lower()
    review = review.replace("# ", "#")
    review = review.replace("@ ", "@")
    review = review.replace(" _ ", "_")
    review = review.replace(" __ ", "")
    review = review.replace("__ ", "__")
    review = review.replace("_ ", "_")
    review = review.replace(' ’ s ', ' is ')
    review = review.replace(' ’ m ', ' am ')
    review = review.replace(' ’ re ', ' are ')
    review = review.replace("’ ll", 'will')
    review = review.replace("i'm", 'i am')
    review = review.replace("you'll", 'you will')
    review = review.replace("don't", 'do not')
    review = review.replace("can't", "can not")
    review = review.replace("it's", "it is")
    review = review.replace("she's", "she is")
    review = review.replace("let's", "let us")
    review = review.replace("i'll", "i will")
    review = review.replace("haven't", "have not")
    review = review.replace("doesn't", "does not")
    review = review.replace("he's", "he is")
    review = review.replace("doesn ’ t", "does not")
    review = review.replace("didn ’ t", "did not")
    review = review.replace("i ’ ve", "i have")
    review = review.replace("we'll", "we will")
    review = review.replace("i ’ d", "i had")
    review = review.replace("won ’ t", "would not")
    review = review.replace("we ’ ve", "we have")
    review = review.replace("you ’ ve", "you are")
    review = review.replace("ain ’ t", "are not")
    review = review.replace("y ’ all", "you and all")
    review = review.replace("couldn ’ t", "could not")
    review = review.replace("haven ’ t", "have not")
    review = review.replace("aren't", "are not")
    review = review.replace("you ’ d", "you had")
    review = review.replace("that's", "that is")
    review = review.replace("wasn't", "was not")
    review = review.replace("he'll", "he will")
    review = review.replace("ma ’ am", 'madam')
    review = review.replace("ma'am ", "madam")
    review = review.replace("they ’ ve", "they have")
    review = review.replace('don ’ t', 'do not')
    review = review.replace('can ’ t', 'can not')
    review = review.replace('isn ’ t', 'is not')
    review = review.replace("b'day", 'birthday')
    review = review.replace("I've", 'I have')
    review = review.replace("didn't", "did not")
    review = review.replace("u're", "you are")
    review = review.replace("What's", 'what is')
    review = review.replace("you're", 'you are')
    review = review.replace("You're", 'you are')
    review = review.replace("I'm", 'I am')
    review = review.replace("isn't", "is not")
    review = review.replace(" ___", "___ ")
    review = review.replace("won't", 'will not')
    review = review.replace('can ’ t', 'can not')
    review = review.replace('I ’ ll ', 'I will')
    review = review.replace("we ’ ll", 'we will')
    review = review.replace("didn ’ t", 'did not')
    review = review.replace(" u ", ' you ')
    review = review.replace("wasn ’ t", 'was not')
    review = review.replace(' ’ s ', ' is ')
    review = review.replace(' ’ m ', ' am ')
    review = review.replace(' ’ re ', ' are ')
    review = review.replace("’ ll", 'will')
    review = review.replace('don ’ t', 'do not')
    review = review.replace('can ’ t', 'can not')
    review = review.replace('isn ’ t', 'is not')
    review = review.replace("I've", 'I have')
    review = review.replace("What's", 'what is')
    review = review.replace("you're", 'you are')

    review = review.replace("You're", 'you are')
    review = review.replace("I'm", 'I am')
    review = review.replace("won't", 'will not')
    review = review.replace('can ’ t', 'can not')
    review = review.replace("we ’ ll", 'we will')
    review = review.replace("didn ’ t", 'did not')
    review = review.replace(" u ", ' you ')
    review = review.replace("wasn ’ t", 'was not')

    # review = review.replace(' ’ re', 'are')
    review = review.replace('+', 'and')
    review_text = BeautifulSoup(review, "lxml").get_text()
    # review_text = _slang_loopup(review_text)
    review_text = ''.join(''.join(s)[:2]
                          for _, s in itertools.groupby(review_text))
    # print(review_text)
    review_text = review_text.split()
    for i in review_text:
        if i.startswith("@"):
            i = 'user'
            i.startswith("")
        if i.startswith("https"):
            continue
        if i.startswith("RT"):
            continue
        else:
            clean_sentence.append(i)
    review_text = ' '.join(str(i) for i in clean_sentence)

    review_text = re.sub("[^a-zA-Zn?!.]", " ", review_text)

    words = review_text.lower().split()
    orig_rev = ' '.join(words).lower()

    return (orig_rev)
Example #33
import codecs
import collections
import string

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

language = "english"
search = "I"
f = codecs.open("index.html", 'r', 'utf-8')


def frequency(arr):
    return collections.Counter(arr)


# the html parser is specified to ensure uniformity in all systems
document = BeautifulSoup(f.read(), features="html.parser").get_text()

# removing duplicated spaces and punctuation
text = " ".join(document.split())
text = text.translate(str.maketrans('', '', string.punctuation))

# unfortunately these curly quotes survived the punctuation stripping
text = text.replace('”', '')
text = text.replace(' “', '')

stop_words = set(stopwords.words(language))

word_tokens = word_tokenize(text)

filtered_sentence = [w for w in word_tokens if not w in stop_words]
# print (filtered_sentence)

freq = frequency(word_tokens)
for key, value in freq.items():
    print(key, value)
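Note that filtered_sentence is computed above but never used for the counts; a stop-word-filtered frequency table would be a one-line change (sketch):

filtered_freq = frequency(filtered_sentence)
print(filtered_freq.most_common(10))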
Example #34
        return self._stemmer.stem(word).lower()


grail = nltk.corpus.webtext.words('grail.txt')
text = IndexedText(porter, grail)
text.concordance('lie')
#lemmatizer - making sure resulting word is in the dictionary then remove affixes
wnl = nltk.WordNetLemmatizer()
print([wnl.lemmatize(t) for t in raw_tokens])

#simple approaches to tokenizing text
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
well without--Maybe it's always pepper that makes people hot-tempered,'..."""
#easiest approach is to split by ' ', leaves out \n and tabs
print(raw.split(' '))
print(re.split(r'[ \t\n]+', raw))  #notice the space
print(re.split(r'\s+', raw))  #includes any white space character
print(re.split(r'\w+', raw))  #try 'xx'.split('x')
print(re.findall(r'\w+', raw))  #why does this happen?
print(
    re.split(r'\W+', raw)
)  #complement of \w, all characters other than letters, digits and underscores
print(
    re.findall(r'\w+|\S\w*', raw)
)  #first match sequence of word chars, if no match try to match any non-whitespace character(complement of \s) followed by other word characters
print(
    re.findall(r'\w+([-\']\w+)*', raw)
)  #permit word internal hyphens and apostrophes, this expression means \w+ followed by zero or more instances of [-']\w+
print(re.findall(r'\w+(?:[-\']\w+)*', raw))
print(
Example #35
      ><span style="font-size: 11px; color: #ff8400">opinions – </span
      ><span style="font-size: 11px; color: #ff8400">enthusiast – </span
      ><span style="font-size: 11px; color: #ff8400">content – </span
      ><span style="font-size: 10px; color: #ff8400">founder – </span
      ><span style="font-size: 10px; color: #ff8400">building – </span
      ><span style="font-size: 10px; color: #ff8400">gwent – </span
      ><span style="font-size: 10px; color: #ff8400">speaker – </span
      ><span style="font-size: 9px; color: #ff8400">student – </span
      ><span style="font-size: 9px; color: #ff8400">react – </span
      ><span style="font-size: 9px; color: #ff8400">tweets</span>
      <p></p>
      <hr />
      <p></p>
      <b>Two word bio cloud</b>
      <p></p>
      <span style="font-size: 60px; color: #ff8400">software engineer – </span
      ><span style="font-size: 27px; color: #ff8400">web developer – </span
      ><span style="font-size: 21px; color: #ff8400">software developer – </span
      ><span style="font-size: 20px; color: #ff8400">frontend developer – </span
      ><span style="font-size: 19px; color: #ff8400">content creator – </span
      ><span style="font-size: 15px; color: #ff8400">web dev – </span
      ><span style="font-size: 15px; color: #ff8400">full stack – </span
      ><span style="font-size: 10px; color: #ff8400">web development – </span
      ><span style="font-size: 9px; color: #ff8400">html css</span>
    </div>
    """
soup = BeautifulSoup(path, features="lxml").get_text(
    strip=True)  # .find_all("div", {"class": "slice_body_1"})
span = soup.split('–')
print(soup)
Example #36
 def extract_url(self, url):
     response = requests.get(url)
     title = BeautifulSoup(response.content).title.text
     title_url = "http://%s" % title.split("://")[1].strip()
     return title_url
def crawl_cveid_list():
    link = 'https://www.cvedetails.com/vulnerability-list.php?vendor_id=0&product_id=0&version_id=0&page=1&hasexp=0&opdos=1&opec=0&opov=0&opcsrf=0&opgpriv=0&opsqli=0&opxss=0&opdirt=0&opmemc=0&ophttprs=0&opbyp=0&opfileinc=0&opginf=0&cvssscoremin=0&cvssscoremax=0&year=0&month=0&cweid=0&order=1&trc=21357&sha=38745b427397c23f6ed92e0ed2d3e114da828672'
    max_page_idx_list = []
    for i in range(13):
        cat_list = ['0'] * 13
        cat_list[i] = '1'
        dos, execution, overflow, memc, sqli, xss, dirtra, httprs, bypass, infor, gainpre, csrf, fileinc = cat_list
        page_num = 1
        link = 'https://www.cvedetails.com/vulnerability-list.php?vendor_id=0&product_id=0&version_id=0&page=' + str(
            page_num) + '&hasexp=0&opdos=' + dos + '&opec=' + execution + '&opov=' + overflow + '&opcsrf=' + csrf + '&opgpriv=' + gainpre + '&opsqli=' + sqli + '&opxss=' + xss + '&opdirt=' + dirtra + '&opmemc=' + memc + '&ophttprs=' + httprs + '&opbyp=' + bypass + '&opfileinc=' + fileinc + '&opginf=' + infor + '&cvssscoremin=0&cvssscoremax=0&year=0&month=0&cweid=0&order=1&trc=28068&sha=0ea5fbc52190c28f2a1c51aca205b315bc4c6509'
        page = requests.get(link, timeout=60, headers={'User-Agent': "Magic Browser"})
        print(link)
        # print(dos, ec, ov, csrf, gpriv, sqli, xss, dirt, memc, httprs, byp, fileinc, inf)
        content = BeautifulSoup(page.content).get_text()

        keyword_section = content.replace('\n', ' ')

        loc_1 = keyword_section.find('This Page)')
        loc_2 = keyword_section.find('How does it work? ')
        max_page_idx = keyword_section[loc_1 + 10:loc_2].split('   	')[0].strip().split()[-1]
        print(max_page_idx)
        max_page_idx_list.append(max_page_idx)

    name_cat = ['dos', 'execution', 'overflow', 'memc', 'sqli', 'xss', 'dirtra', 'httprs',  'bypass', 'infor', 'gainpre', 'csrf',  'fileinc' ]
    sha_value_cat = ['38745b427397c23f6ed92e0ed2d3e114da828672',
                 '0ea5fbc52190c28f2a1c51aca205b315bc4c6509',
                 '363372bbc3616054065946a39f4fa589eb5f0f04',
                 '5829c45b747ab5143004640f312c7f72e5b102db',
                 '1b24fccb15090079e49c0131be821c96dc2f001c',
                 'e3bb5586965f5a13bfaa78233a10ebc3f9606d12',
                 '69098b0b30799b9520bf468c7bc060a7f756abf9',
                 'd5623136f5150876a7dfba54b38fc96fe135993c',
                 '7c71486574161a851e392e2e9dcdfea2cde521c3',
                 '1f368a2d3fc25689cc46e4dcb206b4d6103aaab7',
                 '2f1f77e26ecf09cf8b4f251b1efc2b4bcad02050',
                 'e2c3963a5b4ac67dc5dc9fe39ff95f846162e52d',
                 '4160b1b268ed8bcd97bdd927802ef4922995d3d2']
    CVE_id_list_by_cat = []
    try:
        for cat_idx in range(13)[0:]:
            cat_list = ['0'] * 13
            cat_list[cat_idx] = '1'
            sha_value = sha_value_cat[cat_idx]
            dos, execution, overflow, memc, sqli, xss, dirtra, httprs, bypass, infor, gainpre, csrf, fileinc = cat_list
            # cat_list[cat_idx] = '1'
            max_page_num = int(max_page_idx_list[cat_idx])
            print('crawling the CVE ids in the ' + str(cat_idx) + ' category...')
            CVE_id_list_this_cat = []

            page_num = 1
            cve_cnt = 0
            while page_num <= max_page_num:
                link = 'https://www.cvedetails.com/vulnerability-list.php?vendor_id=0&product_id=0&version_id=0&page=' + str(
                    page_num) + '&hasexp=0&opdos=' + dos + '&opec=' + execution + '&opov=' + overflow + '&opcsrf=' + csrf + '&opgpriv=' + gainpre + '&opsqli=' + sqli + '&opxss=' + xss + '&opdirt=' + dirtra + '&opmemc=' + memc + '&ophttprs=' + httprs + '&opbyp=' + bypass + '&opfileinc=' + fileinc + '&opginf=' + infor + '&cvssscoremin=0&cvssscoremax=0&year=0&month=0&cweid=0&order=1&trc=28068&sha=' + sha_value
                page = requests.get(link, timeout=60, headers={'User-Agent': "Magic Browser"})
                print('category ' + str(cat_idx) + ', page ' + str(page_num) + ', cve count ' + str(cve_cnt), link)
                content = BeautifulSoup(page.content).get_text()
                content_lines_list = content.split('\n')
                for line in content_lines_list:
                    if line.startswith('CVE-'):
                        CVE_id_list_this_cat.append(line.strip())
                        cve_cnt += 1
                page_num += 1
            CVE_id_list_by_cat.append(CVE_id_list_this_cat)

            f_cve_id_cat_file = open('../data/cvedetails_dict/cvedetails_dict' + name_cat[cat_idx], 'w')
            idx = 1
            for cve in CVE_id_list_this_cat:
                f_cve_id_cat_file.write(str(idx) + '\t' + cve + '\n')
                idx += 1

        print(CVE_id_list_by_cat)

    except requests.exceptions.HTTPError as errh:
        print("Http Error: " + str(errh) + " Please check: " + link)
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:" + str(errc) + " Please check: " + link)
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:" + str(errt) + " Please check: " + link)
    except requests.exceptions.RequestException as err:
        print("Other errors!" + str(err) + " Please check: " + link)
Example #38
import requests
from bs4 import BeautifulSoup

usrprice = int(input("Enter buy-in price: \n"))
while True:
    try:
        lst = []
        url = 'https://data.gateio.life/api2/1/ticker/btc_usdt'
        res = requests.get(url)
        html_page = res.content
        soup = BeautifulSoup(html_page, 'html.parser').text
        soup.split(',')
        lst.append(soup)
        parts = soup.split(',')
        lask = parts[5]
        try1 = float(lask[13:20])
        try2 = float(lask[13:18])
        print("Percent Change: ",float(((float(try1)) - float(usrprice)) / usrprice)*100,"%")
        print("Current Price: ",try1,"$")
        print(" ")
    except Exception as e:
        print("Percent Change: ",float(((float(try1)) - float(usrprice)) / usrprice)*100,"%")
        print("Current Price: ",try2,"$")
        print(" ")
    if try1 or try2 != float:
        continue
def main():

    # webdriver options
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--incognito')
    options.add_argument('--headless')

    # initialize webdriver
    driver = webdriver.Chrome(executable_path=binary_path, options=options)
    driver.get(URL)

    # initialize csv
    outfile = open('reviews.csv', 'w', newline='')
    writer = csv.writer(outfile)
    writer.writerow(["author", "rating", "date", "review"])

    for x in range(PAGES):
        # Wait 4 seconds for the DOM to update
        time.sleep(4)
        # Grab new DOM
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Each user review
        reviews_selector = soup.find_all('div', class_='profile-user-review')
        for review_selector in reviews_selector:

            # Grab all data
            author = review_selector.find('span', attrs={
                'itemprop': 'author'
            }).get_text()
            rating = review_selector.find('span',
                                          attrs={
                                              'itemprop': 'reviewRating'
                                          }).get_text()
            date = review_selector.find('span',
                                        attrs={
                                            'itemprop': 'datePublished'
                                        }).get_text()
            review = review_selector.find('span',
                                          attrs={
                                              'itemprop': 'reviewBody'
                                          }).get_text()

            # Clean whitespace
            author = ''.join(content.strip() for content in author.split('/n'))
            rating = ''.join(content.strip() for content in rating.split('/n'))
            date = ''.join(content.strip() for content in date.split('/n'))
            review = BeautifulSoup(
                ''.join(content.strip() for content in review.split('/n')),
                'html.parser')

            # Save all data
            writer.writerow([author, rating, date, review])
        print("✅ Page " + str(x + 1) + "'s Page reviews saved\n")
        try:
            # Find next button on reviews
            nextButton = driver.find_element_by_class_name("next")
            # Click next button
            nextButton.click()
            print("🖱 Mouse click to next review page\n")
        except Exception as error:
            print(error)
            print("🚨 Unable to find buttons to click next")
            sys.exit(errorCodes.unable_to_identify_button)

    print("🙌🏼 Reviews saved to reviews.csv\n")
Example #40
        '''
        SELECT p.documentName AS docName, SUM(frequency) AS freq, GROUP_CONCAT(indexes) AS idxs
        FROM Posting p
        WHERE
            p.word  =  ?
        GROUP BY p.documentName
        ORDER BY freq DESC;
    ''', (token, ))

    for row in cursor:

        namea = row[0]
        pageContent = open(namea, 'r', encoding="utf-8").read()
        pageContent = BeautifulSoup(pageContent).find('body')
        pageContent = pageContent.text.lower()
        lines = pageContent.split("\n")
        non_empty_lines = [line for line in lines if line.strip() != ""]
        pageContent = ""
        for line in non_empty_lines:
            pageContent += line + "\n"
        text_tokens = word_tokenize(pageContent)
        inde = ""
        position = []
        repeat = []
        lila = []
        i = 0
        j = 0
        x = 0
        for coun in text_tokens:
            for coun2 in text_tokens:
                if (coun == coun2):
Example #41
def in_avtor(url_adress):
    html = urlopen(url_adress).read().decode('cp1251')

    baze_parsing = {}  # Per-page record
    key_str = [
        'URL', 'ФИО', 'Название', 'Aдpeс', 'WWW', 'Родился', 'Живет',
        'Обновлялось', 'Объем', 'Рейтинг', 'Посетителей', 'Friend', 'Страна',
        'Город', 'Кол_во', 'Кол_Оценок', 'Friend_on', 'Friend_off'
    ]

    baze_parsing['URL'] = url_adress
    try:
        bs = BeautifulSoup(html, "html.parser").h3.text
        baze_parsing['ФИО'] = bs.split(':\n')[0]
        baze_parsing['Название'] = bs.split(':\n')[1]
    except (IndexError, AttributeError):
        baze_parsing['ФИО'] = ''
        baze_parsing['Название'] = ''

    try:
        start_txt = BeautifulSoup(html, "html.parser").li.text
    except (AttributeError):
        start_txt = ('\n')

    mas = []  # Values scraped from the site
    for elem in str(start_txt).split('\n'):  # Strip stray characters from each line
        if any(c in elem for c in ('\r', ' ', '\t')):
            mas.append(elem.strip('\r, ,\t'))
        else:
            mas.append(elem)

    # Consider a different approach (keyed lookup)
    for key in key_str[3:]:
        if key in " ".join(mas):
            for el in mas:
                if key in el:
                    try:
                        baze_parsing[key] = el.split(': ')[1]
                    except IndexError:
                        baze_parsing[key] = ''
                    break
        else:
            baze_parsing[key] = ''

    # ------------ Remove characters that would break the SQL insert -------------
    if "'" in str(baze_parsing) or "?" in str(baze_parsing):
        for key in key_str[:]:
            if "'" in baze_parsing[key]:  # Remove ' characters
                baze_parsing[key] = baze_parsing[key].replace("'", "_")
            if "?" in baze_parsing[key]:  # Remove ? characters
                baze_parsing[key] = baze_parsing[key].replace("?", "_")

    # ------------- Prepare the strings for loading into the database -------------
    if baze_parsing['Живет'] != '' or ',' in baze_parsing['Живет']:
        baze_parsing['Страна'] = baze_parsing['Живет'].split(',')[0]
        baze_parsing['Город'] = baze_parsing['Живет'].split(',')[1]
    baze_parsing['Родился'] = '-'.join(
        baze_parsing['Родился'].split('/')[::-1])
    baze_parsing['Обновлялось'] = '-'.join(
        baze_parsing['Обновлялось'].split('/')[::-1])
    if '/' in baze_parsing['Объем']:
        baze_parsing['Кол_во'] = baze_parsing['Объем'].split('/')[1]
        baze_parsing['Объем'] = baze_parsing['Объем'].split('k/')[0]
    if '*' in baze_parsing['Рейтинг']:
        baze_parsing['Кол_Оценок'] = baze_parsing['Рейтинг'].split('*')[1]
        baze_parsing['Рейтинг'] = baze_parsing['Рейтинг'].split('*')[0]
    if '/' in baze_parsing['Friend']:
        baze_parsing['Friend_on'] = baze_parsing['Friend'].split('/')[0]
        baze_parsing['Friend_off'] = baze_parsing['Friend'].split('/')[1]
    elif baze_parsing['Friend'] != '':
        baze_parsing['Friend_on'] = baze_parsing['Friend']

    try:
        cursor.execute("""INSERT INTO 'Samizdat' (
            'URL', 'ФИО', 'Название', 'Aдpeс', 'WWW', 'Родился', 'Обновлялось', 'Объем', 'Рейтинг',
            'Посетителей', 'Страна',  'Город', 'Кол_во', 'Кол_Оценок', 'Friend_on', 'Friend_off') 
            VALUES (
            '{URL:s}', '{ФИО:s}', '{Название:s}', '{Aдpeс:s}', '{WWW:s}', '{Родился:s}', '{Обновлялось:s}', '{Объем:s}', '{Рейтинг:s}', 
            '{Посетителей:s}', '{Страна:s}', '{Город:s}', '{Кол_во:s}','{Кол_Оценок:s}', '{Friend_on:s}', '{Friend_off:s}'
            )""".format(**baze_parsing))

        SQL_Connect.commit()  # Commit the changes to the database
    except sqlite3.Error as e:
        print(e, '---------->', baze_parsing['ФИО'], baze_parsing['URL'])
        try:
            cursor.execute(
                "INSERT INTO 'URL_Error' ('URL') VALUES ('{:s}')".format(
                    baze_parsing['URL']))
            SQL_Connect.commit()  # Commit the changes to the database
        except:
            print(e, '--------------------->', baze_parsing['URL'],
                  '<------------------')
    return baze_parsing['ФИО']
Example #42
csv_writer.writeheader()
for row in csv_reader:
    title = row['title'].strip().lower()
    raw_content = row['raw_content']
    clean_content = BeautifulSoup(raw_content, 'lxml').text
    content = []
    labels = []

    # Compile regex to strip everything except lowercase letters and hyphens
    nonalpha = re.compile(r'[^a-z\-]+')
    for word in title.split(' '):
        lower_word = word.lower()
        clean_word = nonalpha.sub('', lower_word)
        if clean_word != '':
            content.append(clean_word)
    for word in clean_content.split(' '):
        lower_word = word.lower()
        clean_word = nonalpha.sub('', lower_word)
        if clean_word != '':
            content.append(clean_word)
    content_str = ' '.join(content).strip()

    if row['sentiment_1'] != '':
        label = row['sentiment_1']
        labels.append(label)
    if row['sentiment_2'] != '':
        label = row['sentiment_2']
        labels.append(label)
    if row['sentiment_3'] != '':
        label = row['sentiment_3']
        labels.append(label)
Example #43
    def craw_detail(self, url, headers, cookies, it):
        time.sleep(random.uniform(1, 3000) / 1000)
        searchid = file_utils.url_parse(url, "SEARCHID")
        #print(url)
        session = requests.session()
        response_list = session.get(url, headers=headers, cookies=cookies)
        cookies = requests.utils.dict_from_cookiejar(session.cookies)
        headers = response_list.headers
        #print(response_list.text)
        response_list.close()
        if response_list.status_code == 200 and 'System error happened' not in response_list.text:
            title = ""
            accession_number = ""
            source_title = ""
            language = ""
            document_type = ""
            abstract = ""
            number_of_references = ""
            main_heading = ""
            controlled_terms = ""
            uncontrolled_terms = ""
            classification_code = ""
            doi = ""
            database = ""
            conference_name = ""
            conference_date = ""
            conference_location = ""
            conference_code = ""
            mumerical_data_indexing = ""
            affiliation_no = ""
            author_affiliation = ""
            affiliation_organization = ""
            country = ""
            authors = ""
            affiliation_no = ""
            e_mail = ""
            funding_number = ""
            funding_acronym = ""
            funding_sponsor = ""
            source_title = ""
            abbreviated_source_title = ""
            issn = ""
            e_issn = ""
            coden = ""
            isbn_13 = ""
            article_number = ""
            issue = ""
            volume = ""
            part_number = ""
            issue_title = ""
            issue_date = ""
            publication_year = ""
            page_begin = ""
            page_end = ""
            publisher = ""
            referance_no = ""
            referance_title = ""
            referance_authors = ""
            referance_source = ""

            list_json = response_list.json()
            results = list_json["results"]
            docindex = results[0].get("doc").get("hitindex")
            docid = results[0].get("doc").get("docid")

            # abstracthref = results[0]["abstracthref"].replace("\n","").replace(" ","")
            time.sleep(random.uniform(1, 3000) / 1000)
            abstracthref = "https://www.engineeringvillage.com/search/doc/abstract.url?content=true&&pageType=quickSearch&usageZone=resultslist&usageOrigin=searchresults&searchtype=Quick&SEARCHID=" + searchid + "&DOCINDEX=" + str(
                docindex
            ) + "&ignore_docid=" + docid + "&database=1&format=quickSearchAbstractFormat&tagscope=&displayPagination=yes"
            #session = requests.session()
            # response = session.get(self.basd_url+abstracthref,headers=headers,cookies=cookies)
            headers["Content-Type"] = "application/json"
            # headers["Connection"] = "keep-alive"
            # headers["Referer"] = "https://www.engineeringvillage.com/search/doc/abstract.url?content=true&&pageType=quickSearch&usageZone=resultslist&usageOrigin=searchresults&searchtype=Quick&SEARCHID="+searchid+"&DOCINDEX="+str(docindex)+"&ignore_docid="+docid+"&database=1&format=quickSearchAbstractFormat&tagscope=&displayPagination=yes"

            # abstract_response = session.get(abstracthref, headers=headers, cookies=cookies)
            # print(abstract_response.text)
            # abstract_json = abstract_response.json()
            # title = BeautifulSoup(abstract_json.get("abstractDetail_highlight_terms_map").get("title"),"lxml").text

            # ------------------------------------------------------detailed----------------------------------------------------------
            time.sleep(random.uniform(1, 3000) / 1000)
            detailedhref = "https://www.engineeringvillage.com/search/doc/detailed.url?content=true&SEARCHID=" + searchid + "&DOCINDEX=" + str(
                docindex
            ) + "&database=1&pageType=expertSearch&searchtype=Expert&dedupResultCount=null&format=expertSearchDetailedFormat&usageOrigin=recordpage&usageZone=abstracttab"
            session = requests.session()
            detailed_response = session.get(detailedhref,
                                            headers=headers,
                                            cookies=cookies)
            #print(detailed_response.text)
            detailed_response.close()
            if detailed_response.status_code == 200:
                detailed_json = detailed_response.json()
                #print(detailed_json)
                detailed_result = detailed_json.get("result")
                title = BeautifulSoup(
                    detailed_json.get("result").get("title"),
                    "lxml").text.replace("'", "\\'").replace('"', '\\"')
                accession_number = detailed_result.get("accnum")
                author_affiliations = detailed_result.get("affils")
                source_title = detailed_result.get("ril")
                language = detailed_result.get("la")
                document_type = detailed_result.get("doctype")
                abstract = BeautifulSoup(
                    detailed_json.get("abstractDetail_highlight_terms_map").
                    get("abstractRecord"), "lxml").text if detailed_json.get(
                        "abstractDetail_highlight_terms_map").get(
                            "abstractRecord") is not None else ''
                number_of_references = detailed_result.get(
                    "abstractrecord").get("refcount")
                main_heading = ''
                if detailed_result.get("abstractrecord") is not None:
                    if detailed_result.get("abstractrecord").get(
                            "termmap") is not None:
                        if detailed_result.get("abstractrecord").get(
                                "termmap").get("MH") is not None:
                            main_heading = detailed_result.get(
                                "abstractrecord").get("termmap").get(
                                    "MH")[0].get("value")
                controlled_terms = BeautifulSoup(
                    detailed_json.get("abstractDetail_highlight_terms_map").
                    get("CVS"), "lxml").text if detailed_json.get(
                        "abstractDetail_highlight_terms_map").get(
                            "CVS") is not None else ''
                uncontrolled_terms = BeautifulSoup(
                    detailed_json.get("abstractDetail_highlight_terms_map").
                    get("FLS"), "lxml").text if detailed_json.get(
                        "abstractDetail_highlight_terms_map").get(
                            "FLS") is not None else ''
                # parse the classification codes in detail
                classification_code_tmp = detailed_result.get(
                    "abstractrecord").get("classificationcodes").get(
                        "Classification code")
                if classification_code_tmp is not None and len(
                        classification_code_tmp) > 0:
                    for cc in classification_code_tmp:
                        classification_code = classification_code + cc.get(
                            "id") + cc.get("title") + " - "
                    classification_code = classification_code.rstrip(' - ')
                doi = detailed_result.get("doi")
                data_base = detailed_result.get("doc").get("dbname")
                conference_name = BeautifulSoup(
                    detailed_result.get("cf"), "lxml"
                ).text if detailed_result.get("cf") is not None else ''
                conference_date = detailed_result.get(
                    "md") if detailed_result.get("md") is not None else ''
                conference_location = detailed_result.get(
                    "ml") if detailed_result.get("ml") is not None else ''
                conference_code = BeautifulSoup(
                    detailed_result.get("cc"),
                    "lxml").text.replace("\n", "").replace(
                        "\t",
                        "") if detailed_result.get("cc") is not None else ""
                mumerical_data_indexing = detailed_result.get(
                    "ndi") if detailed_result.get("ndi") is not None else ''

                sqls = []
                # ei_thesis_thesis
                tt_cauthors = detailed_result.get("cauthors")
                corresponding_author = ""
                corresponding_author_email = ""
                if tt_cauthors is not None and len(tt_cauthors) > 0:
                    for cauthor in tt_cauthors:
                        corresponding_author = corresponding_author + cauthor.get(
                            "name") + ";"
                        corresponding_author_email = corresponding_author_email + (
                            (cauthor.get("email") + ";")
                            if cauthor.get("email") else '')
                id = str(
                    uuid.uuid3(uuid.NAMESPACE_DNS, title + accession_number))
                sql = "REPLACE INTO ei_thesis_thesis(id,title,accession_number,source_title,language,document_type,abstract,number_of_references,main_heading,controlled_terms,uncontrolled_terms,classification_code,doi,data_base,conference_name,conference_date,conference_location,conference_code,mumerical_data_indexing,corresponding_author,corresponding_author_email) " \
                      "VALUES ('" + id + "','" + title + "','" + accession_number + "','" + source_title.replace("'", "\\'").replace('"', '\\"') + "','" + language + "','" + document_type + "','" + abstract.replace("'", "\\'").replace('"', '\\"') + "','" + str(number_of_references) + "','" + main_heading + "','" + controlled_terms.replace("'", "\\'").replace('"', '\\"') + "','" + uncontrolled_terms.replace("'", "\\'").replace('"', '\\"') + "','" + classification_code + "','" + doi + "','" + data_base + "','" + conference_name.replace("'", "\\'").replace('"', '\\"') + "','" + conference_date + "','" + conference_location.replace("'", "\\'").replace('"', '\\"') + "','" + conference_code + "','" + mumerical_data_indexing + "','" + corresponding_author.replace("'", "\\'").replace('"', '\\"') + "','" + corresponding_author_email + "')"
                sqls.append(sql)

                # ei_thesis_affiliation
                if author_affiliations is not None and len(
                        author_affiliations) > 0:
                    for af in author_affiliations:
                        author_affiliation = BeautifulSoup(
                            af.get("name"),
                            "lxml").text if af.get("name") is not None else ''
                        aocs = author_affiliation.split(",")
                        affiliation_organization = ''
                        country = ''
                        if len(aocs) == 5:
                            affiliation_organization = aocs[-3]
                            country = aocs[-1]
                        elif len(aocs) == 4:
                            affiliation_organization = aocs[-3]
                            country = aocs[-1]
                        elif len(aocs) == 3:
                            affiliation_organization = aocs[-2]
                            country = aocs[-1]
                        id = str(
                            uuid.uuid3(
                                uuid.NAMESPACE_DNS,
                                title + accession_number + str(af.get("id"))))
                        sql = 'REPLACE INTO ei_thesis_affiliation(id,title,accession_number,affiliation_no,author_affiliation,affiliation_organization,country)  ' \
                              'VALUES ("' + id + '","' + title + '","' + accession_number + '","' + str(af.get("id")) + '","' + author_affiliation + '","' + affiliation_organization + '","' + country + '")'
                        sqls.append(sql)

                    # ei_thesis_author
                    authors = detailed_result.get("authors")
                    cauthors = detailed_result.get("cauthors")
                    if authors is not None and len(authors) > 0:
                        for au in authors:
                            affiliation_no = au.get("id")
                            author = au.get("name")
                            e_mail = au.get("email")
                            corresponding_author = '0'
                            if cauthors is not None and len(cauthors) > 0:
                                for cauthor in cauthors:
                                    if author == cauthor.get("name"):
                                        corresponding_author = "1"
                            id = str(
                                uuid.uuid3(uuid.NAMESPACE_DNS,
                                           title + accession_number + author))
                            sql = "REPLACE INTO ei_thesis_author(id,title,accession_number,author,affiliation_no,e_mail) " \
                                  "VALUES ('"+id+"','"+title+"','"+accession_number+"','"+author.replace("'", "\\'").replace('"', '\\"')+"','"+str(affiliation_no)+"','"+e_mail+"')"
                            sqls.append(sql)

                    # ei_thesis_funding
                    funding_details = detailed_result.get(
                        "abstractrecord").get("fundingDetails")
                    if funding_details is not None and len(
                            funding_details) > 0:
                        for fd in funding_details:
                            id = str(
                                uuid.uuid3(
                                    uuid.NAMESPACE_DNS,
                                    title + accession_number +
                                    str(fd.get("fundingId"))))
                            sql = "REPLACE INTO ei_thesis_funding(id,title,accession_number,funding_number,funding_acronym,funding_sponsor) " \
                                  "VALUES ('" + id + "','" + title + "','" + accession_number + "','" + str(fd.get("fundingId")) + "','" + fd.get("fundingAcronym") + "','" + fd.get("fundingAgency").replace("'", "\\'").replace('"', '\\"') + "')"
                            sqls.append(sql)

                    # ei_thesis_publication
                    abbreviated_source_title = detailed_result.get(
                        "sourceabbrev")
                    issn = detailed_result.get("citedby").get(
                        "issn") if detailed_result.get("citedby").get(
                            "issn") is not None else ''
                    e_issn = detailed_result.get("abstractrecord").get(
                        "eissn") if detailed_result.get("abstractrecord").get(
                            "eissn") is not None else ''
                    if e_issn is not None and e_issn != '':
                        e_issn = e_issn[0:4] + "-" + e_issn[4:len(e_issn)]
                    coden = detailed_result.get("abstractrecord").get(
                        "coden") if detailed_result.get("abstractrecord").get(
                            "coden") is not None else ''
                    isbn_13 = detailed_result.get(
                        "isbn13") if detailed_result.get(
                            "isbn13") is not None else ''
                    article_number = detailed_result.get(
                        "articlenumber") if detailed_result.get(
                            "articlenumber") is not None else ''
                    issue = detailed_result.get("citedby").get("firstissue")
                    volume = detailed_result.get("vo")
                    part_number = detailed_result.get(
                        "cfpnum") if detailed_result.get(
                            "cfpnum") is not None else ''
                    issue_title = detailed_result.get("mt").replace(
                        "::H:", ":H::")
                    issue_date = detailed_result.get("sd")
                    publication_year = detailed_result.get("yr")
                    pages = detailed_result.get("pages")
                    page_begin = ""
                    page_end = ""
                    pages_split = pages.split("-")
                    if len(pages_split) == 2:
                        page_begin = pages_split[0]
                        page_end = pages_split[1]
                    publisher = detailed_result.get("pn").replace(
                        "::H:", ":H::")
                    id = str(
                        uuid.uuid3(uuid.NAMESPACE_DNS,
                                   title + accession_number))
                    sql = "REPLACE INTO ei_thesis_publication(id,title,accession_number,source_title,abbreviated_source_title,issn,e_issn,coden,isbn_13,article_number,issue,volume,part_number,issue_title,issue_date,publication_year,page_begin,page_end,publisher) " \
                          "VALUES ('" + id + "','" + title + "','" + accession_number + "','" + source_title.replace("'", "\\'").replace('"', '\\"') + "','" + abbreviated_source_title.replace("'", "\\'").replace('"', '\\"') + "','" + str(issn) + "','" + str(e_issn) + "','" + str(coden) + "','" + str(isbn_13) + "','" + str(article_number) + "','" + str(issue) + "','" + volume + "','" + str(part_number) + "','" + issue_title.replace("'", "\\'").replace('"', '\\"') + "','" + issue_date + "','" + publication_year + "','" + page_begin + "','" + page_end + "','" + publisher.replace("'", "\\'").replace('"', '\\"') + "')"
                    sqls.append(sql)

                # ------------------------------------------------------Compendex Refs------------------------------------------------------
                # refs, part 1: if there is no reference information, detailed_result.get("abstractrecord").get("refcount") is -1; otherwise it holds the actual number of references
                if number_of_references != -1:
                    time.sleep(random.uniform(1, 3000) / 1000)
                    refshref = "https://www.engineeringvillage.com/search/doc/refs.url?content=true&refType=compendex&searchtype=Expert&usageOrigin=recordpage&usageZone=detailedtab&pageType=expertSearch&SEARCHID=" + searchid + "&DOCINDEX=" + str(
                        docindex
                    ) + "&database=1&docid=" + docid + "&totalResultsCount=67010&displayPagination=yes&dbid=cpx"
                    session = requests.session()
                    refs_response = session.get(refshref,
                                                headers=headers,
                                                cookies=cookies)
                    #print(refs_response.text)
                    refs_response.close()
                    if refs_response.status_code == 200:
                        refs_json = refs_response.json()
                        #print(refs_json)
                        referenceBean = refs_json.get("referenceBean")
                        title_authors = referenceBean.get("results")
                        sources = referenceBean.get(
                            "resultformat_abssourcelines")
                        if title_authors is not None and len(
                                title_authors) > 0:
                            for index in range(0, len(title_authors)):
                                referance_no = index + 1
                                referance_authors = ""
                                t_authors = title_authors[index].get("authors")
                                if t_authors is not None and len(
                                        t_authors) > 0:
                                    for tau in t_authors:
                                        referance_authors = referance_authors + tau.get(
                                            "name") + ";"
                                referance_title = title_authors[index].get(
                                    "title").replace("'", "\\'").replace(
                                        '"', '\\"')
                                referance_authors = referance_authors.replace(
                                    "'", "\\'").replace('"', '\\"')
                                referance_source = BeautifulSoup(
                                    sources[index],
                                    "lxml").text.replace("'", "\\'").replace(
                                        '"', '\\"').replace('Source:  ', '')
                                id = str(
                                    uuid.uuid3(
                                        uuid.NAMESPACE_DNS, title +
                                        accession_number + referance_title))
                                sql = "REPLACE INTO ei_thesis_reference(id,title,accession_number,referance_no,referance_title,referance_authors,referance_source) " \
                                      "VALUES ('" + id + "','" + title + "','" + accession_number + "','" + str(referance_no) + "','" + referance_title + "','" + referance_authors + "','" + referance_source + "')"
                                sqls.append(sql)

                    # refs, part 2: only needed when there are more than 25 references; otherwise there is a single page and no next page
                    if number_of_references > 25:
                        time.sleep(random.uniform(1, 3000) / 1000)
                        refshref = "https://www.engineeringvillage.com/search/doc/refs.url?content=true&compendexajax=t&docid=" + docid + "&SEARCHID=" + searchid + "&database=1&DOCINDEX=&currPageNumber=2&searchtype=Expert&pageSize=25"
                        session = requests.session()
                        refs_response = session.get(refshref,
                                                    headers=headers,
                                                    cookies=cookies)
                        #print(refs_response.text)
                        refs_response.close()
                        refs_json = refs_response.json()
                        #print(refs_json)
                        referenceBean = refs_json.get("referenceBean")
                        title_authors = referenceBean.get("results")
                        sources = referenceBean.get(
                            "resultformat_abssourcelines")
                        if title_authors is not None and len(
                                title_authors) > 0:
                            for index in range(0, len(title_authors)):
                                referance_no = index + 1
                                referance_authors = ""
                                t_authors = title_authors[index].get("authors")
                                if t_authors is not None and len(
                                        t_authors) > 0:
                                    for tau in t_authors:
                                        referance_authors = referance_authors + tau.get(
                                            "name") + ";"
                                referance_title = title_authors[index].get(
                                    "title").replace("'", "\\'").replace(
                                        '"', '\\"')
                                referance_authors = referance_authors.replace(
                                    "'", "\\'").replace('"', '\\"')
                                referance_source = BeautifulSoup(
                                    sources[index],
                                    "lxml").text.replace("'", "\\'").replace(
                                        '"', '\\"').replace('Source:  ', '')
                                id = str(
                                    uuid.uuid3(
                                        uuid.NAMESPACE_DNS, title +
                                        accession_number + referance_title))
                                sql = "REPLACE INTO ei_thesis_reference(id,title,accession_number,referance_no,referance_title,referance_authors,referance_source) " \
                                      "VALUES ('" + id + "','" + title + "','" + accession_number + "','" + str(referance_no) + "','" + referance_title + "','" + referance_authors + "','" + referance_source + "')"
                                sqls.append(sql)
                #print(sqls)
                self.mysqlclient.insert_thesis_afoprt(sqls)
            else:
                self.redis_client.lpush(self.consumer_list_success_fail,
                                        json.dumps(it))
        else:
            self.redis_client.lpush(self.consumer_list_success_fail,
                                    json.dumps(it))
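craw_detail above repeats long chains such as detailed_result.get("abstractrecord").get(...) wrapped in None checks; a tiny lookup helper makes those reads shorter and None-safe. A minimal sketch (deep_get is an illustrative helper, not part of the original crawler):

def deep_get(d, *keys, default=''):
    # walk nested dicts, returning default as soon as a level is missing
    for key in keys:
        if not isinstance(d, dict):
            return default
        d = d.get(key)
        if d is None:
            return default
    return d

# usage, assuming a detailed_json shaped like the response parsed above:
# e_issn = deep_get(detailed_json, 'result', 'abstractrecord', 'eissn')
# refcount = deep_get(detailed_json, 'result', 'abstractrecord', 'refcount', default=-1)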
Ejemplo n.º 44
0
    def create_index(self, index_folder, docs_path, add_terms=False):

        print('Loading Vocab...')
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        if add_terms:
            if prm.top_tfidf > 0 or prm.idf_path:
                print('Creating IDF dictionary...')
                self.idf = defaultdict(int)
                doc_id = 0
                if docs_path.lower().endswith('.hdf5'):
                    import corpus_hdf5
                    corpus = corpus_hdf5.CorpusHDF5(docs_path)
                    for txt in corpus.get_text_iter():
                        self.add_idf(txt)

                        if doc_id % 1000 == 0:
                            print('Creating IDF, doc', doc_id)
                        doc_id += 1

                else:
                    # ClueWeb09
                    import warc
                    import gzip
                    from bs4 import BeautifulSoup
                    # list all files in the folder.
                    paths = []
                    for root, directories, filenames in os.walk(docs_path):
                        for filename in filenames:
                            paths.append(os.path.join(root, filename))

                    for path in paths:
                        with gzip.open(path, mode='rb') as gzf:
                            for record in warc.WARCFile(fileobj=gzf):
                                # remove html tags
                                txt = BeautifulSoup(
                                    record.payload[:1000 * 1000],
                                    "lxml").get_text()
                                # remove WARC headers.
                                txt = '\n'.join(txt.split('\n')[10:])

                                self.add_idf(txt)

                                if doc_id % 1000 == 0:
                                    print('Creating IDF, doc', doc_id)
                                doc_id += 1

                for key, val in self.idf.items():
                    self.idf[key] = math.log(float(doc_id) / val)

                pkl.dump(self.idf, open(prm.idf_path, 'wb'))

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0
        if docs_path.lower().endswith('.hdf5'):
            import corpus_hdf5
            corpus = corpus_hdf5.CorpusHDF5(docs_path)
            for txt in corpus.get_text_iter():
                title = corpus.get_article_title(doc_id)
                self.add_doc(doc_id, title, txt, add_terms)
                if doc_id % 1000 == 0:
                    print('indexing doc', doc_id)
                doc_id += 1
        else:
            # ClueWeb09
            import warc
            import gzip
            from bs4 import BeautifulSoup

            # list all files in the folder.
            paths = []
            for root, directories, filenames in os.walk(docs_path):
                for filename in filenames:
                    paths.append(os.path.join(root, filename))

            for path in paths:
                with gzip.open(path, mode='rb') as gzf:
                    for record in warc.WARCFile(fileobj=gzf):
                        if 'warc-trec-id' in record:
                            title = record['warc-trec-id']
                        else:
                            title = record['warc-record-id']
                        # remove html tags
                        #txt = BeautifulSoup(record.payload[:1000*1000], "lxml").get_text()
                        txt = record.payload[:1000 * 1000]
                        # remove WARC headers.
                        txt = '\n'.join(txt.split('\n')[10:])

                        self.add_doc(doc_id, title, txt, add_terms)
                        if doc_id % 1000 == 0:
                            print('indexing doc', doc_id)
                        doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()
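The loop that finalizes self.idf above applies the standard formula idf(t) = log(N / df(t)), where df(t) is the number of documents containing t and N the number of documents seen. A minimal standalone sketch of that computation, independent of the Lucene indexer:

import math
from collections import defaultdict

docs = [['apple', 'banana'], ['apple', 'cherry'], ['banana', 'banana']]
df = defaultdict(int)
for doc in docs:
    for term in set(doc):      # count each term at most once per document
        df[term] += 1
idf = {t: math.log(len(docs) / v) for t, v in df.items()}
# apple and banana appear in 2 of 3 docs -> log(1.5); cherry in 1 -> log(3.0)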
Ejemplo n.º 45
0
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
train_label = pd.read_csv("labeledTrainData.tsv", delimiter="\t", quoting=3)
train_unlabel = pd.read_csv("unlabeledTrainData.tsv",
                            delimiter="\t",
                            quoting=3)
test = pd.read_csv("testData.tsv", delimiter="\t", quoting=3)

corpus = []
corpus2 = []
for i in range(0, 25000):
    review = re.sub('[^A-Za-z]', ' ', train_label['review'][i])
    review = BeautifulSoup(review).get_text()
    review = review.lower()
    review = review.split()
    review3 = re.sub('[^A-Za-z]', ' ', test['review'][i])
    review3 = BeautifulSoup(review3).get_text()
    review3 = review3.lower()
    review3 = review3.split()
    ps = PorterStemmer()

    review = [
        ps.stem(i) for i in review if not i in set(stopwords.words('english'))
    ]
    review = ' '.join(review)
    corpus.append(review)

    review3 = [
        ps.stem(i) for i in review3 if not i in set(stopwords.words('english'))
    ]
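In the loop above, set(stopwords.words('english')) is rebuilt for every token tested and PorterStemmer is re-created on every iteration, and the regex runs before BeautifulSoup even though it strips the tag brackets the parser would need. A minimal sketch that hoists both objects out of the loop and parses the HTML first, assuming the same train_label and test DataFrames (using corpus2 for the test reviews is an assumption about the truncated tail of the loop):

ps = PorterStemmer()
english_stops = set(stopwords.words('english'))

def preprocess(raw_review):
    # strip HTML, keep letters only, lowercase, drop stop words, stem
    text = BeautifulSoup(raw_review, 'lxml').get_text()
    words = re.sub('[^A-Za-z]', ' ', text).lower().split()
    return ' '.join(ps.stem(w) for w in words if w not in english_stops)

corpus = [preprocess(r) for r in train_label['review'][:25000]]
corpus2 = [preprocess(r) for r in test['review'][:25000]]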
Ejemplo n.º 46
0
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height


#%%
for media_href in list(media_hrefs):
    
    driver.get(media_href)
    try:
        imgUrl = BeautifulSoup(
                driver.page_source
                , 'html.parser').select('.KL4Bh')[0].select('img')[0].get('srcset').split(',')[-1:][0][:-5]
        dateString = driver.find_element_by_tag_name('time').get_attribute('datetime')[:10]

        dateDirPath = '../data/' + dateString
        if not os.path.isdir(dateDirPath):
            os.makedirs(dateDirPath)
        print("Downloading images " + dateDirPath)
        urllib.request.urlretrieve(
                imgUrl
                , filename=os.path.join(
                        dateDirPath
                        , imgUrl.split('/')[-1:][0]))
    except Exception:
        pass  # skip posts whose image URL or timestamp cannot be extracted
        
print("Finished")
driver.close()
Ejemplo n.º 47
0
#!/usr/bin/python
# _*_ coding:utf-8 _*_

import nltk
import urllib
from nltk import re
from bs4 import BeautifulSoup
from urllib import request
import ssl

#Extract the information from the url
url = input("Enter Your Website:\n")
ssl._create_default_https_context = ssl._create_unverified_context
html = request.urlopen(url).read()
raw = BeautifulSoup(html).get_text()
text_nopunct = [word for word in raw.split()]
print(raw)
raw = raw.replace(' ', '')

#Get the telephone number
pattern = re.compile(
    r"((\+?55|0)\-?\s?[1-9]{2}\-?\s?[2-9]{1}\d{3,4}\-?\d{4}|(0?\d{4,5})\s?\d{6}|0\d{10}|(\+44|0044)\s?\d{10})"
)
result = pattern.findall(str(html))

if result:
    print("Found a Match:")
    print(result[0])
else:
    print("No match found")

#print(pattern)
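Because the pattern above contains several capturing groups, findall returns tuples of group values rather than the full matched numbers; iterating with finditer and taking group(0) yields the complete match. A minimal sketch, assuming the same compiled pattern and html string:

matches = [m.group(0) for m in pattern.finditer(str(html))]
if matches:
    print("Found {} match(es):".format(len(matches)))
    for number in matches:
        print(number)
else:
    print("No phone number found")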
Ejemplo n.º 48
0
                                         lk['href'])
                schl_detl_soup = BeautifulSoup(schl_detl.text)
                contact = schl_detl_soup.find(class_='contact')
                school_nm = contact.h4.getText('|').split('|')
                t = contact.find_all('p')[0].text.split('\n')
                text = []
                for te in t:
                    text.extend(te.split('\xa0'))
                c = {}
                for t in text:
                    a = t.replace('\r', '')
                    if a == '':
                        continue
                    c.update({a.split(':')[0]: a.split(':')[1]})
                sir = contact.find_all('p')[1].getText(' ')
                sir = sir.split(':')
                e = sir[1].split(' ')[0]
                r = ''.join(sir[1].split(' ')[1:])
                sir = {sir[0]: e, r: sir[2]}
                # print(sir.split('|'))
                c.update(sir)
                c.update({'學校(中文)': school_nm[0]})
                c.update({'學校(英文)': school_nm[1]})
                c.update({'地區': link.text})
                print(c)
                break
            break
        break
def getInterfacesFromReport(method_name, dict_link, folder_path, crsr, caller):
    """
    This method recursively discovers all of the branches of invocation stemming off of the initial
    function passed into the script. It checks for cycles in the graph.
    """
    if loading_count[0] > 50:
        loading_count[0] = 0
    sys.stdout.write('\r')
    sys.stdout.write("Generating Interface Graph [%-50s]" %
                     ('=' * loading_count[0]))
    sys.stdout.flush()  #loading bar print for the console
    loading_count[0] += 1
    if len(method_name) < 1:  # edge case calls empty name
        return
    test_if_in_graph = method_name + "#" + dict_link
    if test_if_in_graph in interface_graph:  # how to check for cycles and recursion
        print(
            '\nWARNING: Possible cycle found in invocation tree. %s was invoked by %s and is already a vertex in the interface graph.'
            % (method_name, caller),
            file=cycles_log)
        return
    if not method_name[0].isalpha():
        frst = 'Non-Alpha'
    else:
        frst = method_name[0]  # decide which char to use for inv tree file
    try:
        invtrees_html = open(folder_path + '\simpleinvtree_' + frst + '.html',
                             encoding='utf8')
    except FileNotFoundError as e:  #if there is a failure to open the file
        return
    invtrees_html = BeautifulSoup(invtrees_html, 'html.parser')
    invtree_blocks = str(invtrees_html).split(
        "\n\n")  #break the whole page into individual trees
    inv_list_text = []  #will contain the current invocation tree

    # Below is to see if the method is a constructor. This important because in Understand docs the
    # way that it links the constructor to its place in the data dictionary changes
    # depending on whether the constructor is being called or is the caller (Frustrating!), making it impossible to
    # automatically match the constructor tree from where it is being called to its tree automatically,
    # which is what prompts the user input when duplicates are discovered.
    test_for_constructor = method_name.split('::')
    if len(test_for_constructor) < 2:
        isConstructor = False
    else:
        if test_for_constructor[0].strip() == test_for_constructor[1].strip():
            isConstructor = True
        else:
            isConstructor = False
    if dict_link == '' or isConstructor:
        inv_lists = [
        ]  #This list will store all of the invocation trees that the script finds that come from the same Class::Method
    #-----regex compilation-----#
    get_link_regex = re.compile(r'\<a href=\"dictionary_.+?\.html#(.+)\"\>')
    #---------------------------#
    for tree in invtree_blocks:  #search for the right tree
        original_block = tree
        tree = BeautifulSoup(tree, 'html.parser')
        tree = str(tree).replace(
            "|", "")  # take out the | symbol understand puts in the html
        tree = tree.split(
            '\n'
        )  # change the tree into a list where [0] == invoker and [1:] are the invocations
        tree = list(map(str.strip,
                        tree))  # strip each of the strings in the list
        invoker_html = tree[0]  # still in HTML
        invoker_text = BeautifulSoup(invoker_html, 'html.parser')
        invoker_text = invoker_text.get_text()  # turn to text
        if invoker_text == method_name:  #found a potentially correct tree
            if dict_link == '' or isConstructor:
                inv_lists.append(original_block)  #collect potential lists
                continue
            else:  # normal behavior
                tmp_link = get_link_regex.search(invoker_html).group(
                    1)  # get the dictionary link
                if tmp_link == dict_link:  # we have the right name and dict_link
                    inv_list_text.append(invoker_text + '#' + tmp_link)
                    for index in range(
                            1, len(tree)
                    ):  #loop through the tree and turn the HTML to text and add to inv_list_text
                        tmp_link = get_link_regex.search(tree[index])
                        method_text = BeautifulSoup(tree[index], 'html.parser')
                        method_text = method_text.get_text()
                        if tmp_link == None and method_text == '':  # this if statement is to capture when the invocation tree is at the bottom of the page so the .split '\n\n' didn't work
                            #handle when it as at the end of the page
                            table_text = BeautifulSoup(tree[index + 1],
                                                       'html.parser')
                            table_text = table_text.get_text()
                            if table_text == 'Non-AlphaABCDEFGHIJKLMNOPQRSTUVWXYZ':  #search for table at the bottom of the page
                                break
                        inv_list_text.append(method_text + '#' +
                                             tmp_link.group(1))
                    break  # found the right tree so we can stop iterating over them
#---------------Handling first time methods------------------#
    if dict_link == '':  # this block is for when the method is passed in at the start of the code, so no dict link
        if dict_link == '' and len(inv_lists) == 0:  #no invocations found
            print(
                "\n\nThis function doesn't have any interfaces! Try a different one."
            )  # user gave a function without interfaces
            exit()
        if dict_link == '' and len(
                inv_lists
        ) == 1:  # we found an invocation tree that matches the name, assume it is right
            inv_list_html = inv_lists[0]
            inv_list_html = str(inv_list_html).split('\n')
        elif dict_link == '' and len(
                inv_lists
        ) > 1:  #more than one, no html link to ref yet, have to get user input to decide
            # prompt user to tell me which one they want
            count = 0
            for tree in inv_lists:  #print out each of the method's inv trees and ask which is the right one
                count += 1
                tree_text = BeautifulSoup(tree, 'html.parser')
                tree_text = tree_text.get_text()
                tmp = str(count) + ": "
                print(tmp)
                print(tree_text, '\n')
                start_letter = tree_text[0]
            # open the inv tree understand html page so the user can have some help deciding
            webbrowser.open('file://' +
                            os.path.realpath(folder_path + '\simpleinvtree_' +
                                             frst + '.html'))
            print(
                '\n\nMultiple invocation trees with same Class::Method found. Which one would you like to generate a chart with?'
            )
            print(
                'Type the number of the tree you would like to start with.\nI have opened the HTML report in your browser as well to help you decide.'
            )
            k = input()  #have the user select a number
            try:
                k = int(k)
            except:
                k = -1
            while (int(k) > count
                   or int(k) < 1):  #make sure it is a good input
                print("Invalid input: try again please")
                k = input()
                try:
                    k = int(k)
                except:
                    k = -1
            inv_list_html = inv_lists[k - 1]  # we now know which one we want
            inv_list_html = str(inv_list_html).split(
                '\n')  # list of methods with link wrapping it in html
        if dict_link == '' and len(
                inv_list_html
        ) > 0:  # if inv_list_html len is > 0 then we found something, manipulate it for recursion
            inv_list_text = []
            count = 0
            for method in inv_list_html:
                method_link = get_link_regex.search(method).group(1)
                method_text = BeautifulSoup(method, 'html.parser')
                method_text = method_text.get_text()  #turn from HTML to text
                if count > 0:  #methods other than the invoker have these | that understand puts
                    method_text = method_text.replace('| ', '')
                if count == 0:  # since the first method that is passed in by the user doesn't have a dict_link we must save it
                    start_func_dict_link[0] = method_link
                method_string = method_text + "#" + method_link
                inv_list_text.append(
                    method_string)  #get the tree in list form for recursion
                count += 1


#------------Handling when the constructor has a different dict link----------------#
    else:
        if isConstructor and len(
                inv_lists) == 0:  # there was no invocation tree found
            return
        if isConstructor and len(
                inv_lists
        ) == 1:  # only one found, this must be the correct one, assume it is correct
            # have the update dict link on the interface graph to this new one
            # use this as the invocation tree
            inv_list_html = inv_lists[0]
            inv_list_html = str(inv_list_html).split('\n')
        elif isConstructor and len(inv_lists) > 1:
            # when the constructor has many definitions, make the user select the one we want
            if SKIP_DUPLICATES:
                return  #this is for when the user selects they don't want to be prompted to select the right constructor
            count = 0
            for tree in inv_lists:  #print out the inv trees and open the web browser to help the user choose the right one
                count += 1
                tree_text = BeautifulSoup(tree, 'html.parser')
                tree_text = tree_text.get_text()
                tmp = str(count) + ": "
                print(tmp)
                print(tree_text, '\n')
                start_letter = tree_text[0]
            webbrowser.open('file://' +
                            os.path.realpath(folder_path + '\simpleinvtree_' +
                                             frst + '.html'))
            print(
                '\n\nMultiple versions of the same constructor found. Which one is called by',
                caller, '?')
            print(
                'Type the number of the correct tree.\nI have opened the HTML report in your browser as well to help you decide.\nIf you don\'t wish to decide, type \'0\''
            )
            k = input()
            try:
                k = int(k)
            except:
                k = -1
            while (int(k) > count or int(k) < 0):
                print("Invalid input: try again please")
                k = input()
                try:
                    k = int(k)
                except:
                    k = -1
            if k == 0:  #if the user selects 0, it won't continue traversing this branch
                return
            inv_list_html = inv_lists[k - 1]  # we now know which one we want
            inv_list_html = str(inv_list_html).split(
                '\n')  # list of methods with link wrapping it in html
        # get html into neat list
        if isConstructor and len(
                inv_list_html
        ) > 0:  # if inv_list_html len is > 0 then we found something, manipulate it for recursion
            inv_list_text = []
            for i in range(
                    0, len(inv_list_html)
            ):  #iterate through the tree and get it in proper text format with dict_link
                method_link = get_link_regex.search(inv_list_html[i]).group(1)
                method_text = BeautifulSoup(inv_list_html[i], 'html.parser')
                method_text = method_text.get_text()
                if i > 0:
                    method_text = method_text.replace('| ', '')
                method_string = method_text + "#" + method_link
                if i == 0:  # have to update the graph last call with the new dict link, because of the issue with the dict link being different when it is the callee vs. the caller
                    old_string = method_name + '#' + dict_link  # string to be replaced
                    prev_tree = interface_graph[caller]
                    prev_tree[prev_tree.index(old_string)] = method_string
                    interface_graph[caller] = prev_tree
                inv_list_text.append(method_string)
    #-----------Ready to fix the correct invocation list up for insert into graph----#
    # when we get here we have inv_list_text with the list of invocations in order inv_list_text[0] == invoker and [1:] invocations
    if len(inv_list_text) >= 2:  # there are invoked methods
        if inv_list_text[
                0] not in interface_graph:  # double check to make sure there won't be a cycle created
            i = 1
            while i < len(inv_list_text):
                if '(Virtual)' in inv_list_text[
                        i]:  #if virtual is on there strip it off for the graph
                    inv_list_text[i] = inv_list_text[i].replace(
                        '  (Virtual)', '').strip()
                if inv_list_text[i] in interface_graph or inv_list_text[
                        i] == inv_list_text[
                            0]:  #deal with cycles caused by recursion or edges
                    # outputs to log file. This case will be triggered if a method exists in the current invocation tree and it has
                    # already been added to the graph as another vertex. This maintains integrity of acyclic quality for topological sort
                    print(
                        '\nWARNING: Possible cycle found in invocation tree. %s was invoked by %s, and is already a vertex in the interface graph.'
                        % (inv_list_text[i], inv_list_text[0]),
                        file=cycles_log)
                    # warning to the user
                    del inv_list_text[
                        i]  # delete the edge causing the cycle for the sake of the topological sort
                    i -= 1
                i += 1
            if len(
                    inv_list_text
            ) >= 2:  # after del edges that make cycles make sure there at least one edge coming from the vertex
                interface_graph[inv_list_text[0]] = inv_list_text[1:]
            else:
                return
        else:  # this only gets triggered if the dict_link of the invoker method changed between the start of this method's execution and here
            print(
                '\nWARNING: Possible cycle found in invocation tree. %s was invoked by %s and is already a vertex in the interface graph.'
                % (inv_list_text[0], caller),
                file=cycles_log)
            return
    for i in range(1, len(inv_list_text)
                   ):  #time to recurse over the branches left in the tree
        tmp = inv_list_text[i].split(
            '#')  # break the method from its dict link
        method_name = tmp[0]
        dict_link = tmp[1]
        # first check if the method is in the DB, if it is we assume that that the branch from that method has been fully traversed
        if not methodInDB(method_name, dict_link, crsr)[0]:
            getInterfacesFromReport(
                method_name, dict_link, folder_path, crsr,
                inv_list_text[0])  # calls getInt. on next method
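The cycle handling above comes down to one policy: before recording an invocation edge, refuse it if the callee is already a vertex in interface_graph (or is the invoker itself), log a warning, and keep the rest, so the graph stays acyclic for a later topological sort. A minimal standalone sketch of that policy (the helper name add_edges is illustrative):

interface_graph = {}

def add_edges(invoker, callees, log=print):
    # add invoker -> callees, dropping any edge that would point back into the graph
    if invoker in interface_graph:
        log('WARNING: %s is already a vertex; skipping to avoid a cycle' % invoker)
        return
    safe = [c for c in callees if c not in interface_graph and c != invoker]
    if safe:
        interface_graph[invoker] = safe

add_edges('A#link1', ['B#link2', 'C#link3'])
add_edges('B#link2', ['A#link1', 'D#link4'])  # the edge back to A#link1 is dropped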
Ejemplo n.º 50
0
            text = BeautifulSoup(text, features="html.parser")

            # removes html tags
            text = text.get_text()

            # removes target=blank Markdown tags
            text = text.replace("{:target=\"_blank\"}", '')

            # removes Markdown links
            text = regex.sub(linkremover, '', text)

            # removes anything that isn't an alphabetical character and casts the remaining string to lowercase
            text = regex.sub(nonalphabeticalremover, ' ', text).lower()

            wordcount += len(text.split())

            # nltk stemming/token magic from http://ryancompton.net/2014/06/06/statistical-features-of-infinite-jest/
            tokens = nltk.word_tokenize(text)
            stemmer = nltk.stem.PorterStemmer()
            stemmed_tokens = map(lambda x: stemmer.stem(x), tokens)

            for token in stemmed_tokens:
                if token in uniquewords:
                    newVal = uniquewords.get(token) + 1
                    uniquewords.update({token: newVal})
                else:
                    uniquewords.update({token: 1})
        continue

if wordcount < 1:
Ejemplo n.º 51
0
        for srow in standinglist:
            srowPrintable = str(srow)
            srowPrintable = srowPrintable.replace('[', '')
            srowPrintable = srowPrintable.replace(']', '')
            srowPrintable = srowPrintable.replace('\'', '')
            srowCSV = srowPrintable.split(',')
            swr.writerow(srowCSV)

    with open('ol.csv', 'w', encoding="utf-8", newline='') as ol_file:
        olwr = csv.writer(ol_file)
        for olrow in oltable_rows:
            olrowPrintable = str(olrow)
            olrowPrintable = olrowPrintable.replace(',', '.')
            olrowPrintable = olrowPrintable.replace(' (R)', '')
            olrowPrintable = BeautifulSoup(olrowPrintable, "lxml").get_text(separator=',')
            olrowCSV = olrowPrintable.split(',')
            olwr.writerow(olrowCSV)

    with open('r.csv', 'w', encoding="utf-8", newline='') as r_file:
        rwr = csv.writer(r_file)
        for rrow in rtable_rows:
            rrowPrintable = str(rrow)
            rrowPrintable = rrowPrintable.replace(',', '.')
            rrowPrintable = rrowPrintable.replace(' (R)', '')
            rrowPrintable = BeautifulSoup(rrowPrintable, "lxml").get_text(separator=',')
            rrowCSV = rrowPrintable.split(',')
            rwr.writerow(rrowCSV)


    with open('p.csv', 'w', encoding="utf-8", newline='') as p_file:
        pwr = csv.writer(p_file)
Ejemplo n.º 52
0
                # removes html tags
                text = text.get_text()

                # removes target=blank Markdown tags
                text = text.replace("{:target=\"_blank\"}", '')

                # removes Markdown links
                text = regex.sub(linkremover, '', text)

                # removes anything that isn't an alphabetical character and casts the remaining string to lowercase
                text = regex.sub(nonalphabeticalremover, ' ', text).lower()

                blob = TextBlob(text)

                sentiments.update({filename: blob.sentiment.polarity})
                wordsperpost.update({filename: len(text.split())})

                wordcount += len(text.split())

                # nltk stemming/token magic from http://ryancompton.net/2014/06/06/statistical-features-of-infinite-jest/
                tokens = nltk.word_tokenize(text)
                stemmer = nltk.stem.PorterStemmer()
                stemmed_tokens = map(lambda x: stemmer.stem(x), tokens)

                for token in stemmed_tokens:
                    if token in stems:
                        newVal = stems.get(token) + 1
                        stems.update({token: newVal})
                    else:
                        stems.update({token: 1})
Ejemplo n.º 53
0
    stems = stem_tokens(words, stemmer)
    return stems

#Pre-processing step
print("Pre-processing documents...")
for file in file_name:
    if(counter < 201):
        start = timer()
        with open(os.path.join(folder_dir,rel_path,file), 'rb') as f:
            read_data = f.read() #Read from file

        input_str = BeautifulSoup(read_data, "lxml").get_text() # Extract text from document
        input_str = input_str.casefold() #Convert to lower-case
        input_str = re.sub(r'\d+', '', input_str) #Remove numbers 
        input_str = input_str.translate(str.maketrans("","",string.punctuation)) #Remove punctuation
        input_str = " ".join(input_str.split()) #Removes whitespaces
        input_str = input_str.replace("\n"," ") #Removes newline
        input_str = unicodedata.normalize("NFKD", input_str) #Removes unicode characters.
        corpus[file] = input_str
        print(counter)
        counter+=1
        f.close()
    else:
        break   
#print(list(corpus.values())[0]) --Print first document's text for testing
values = []
files = []

for k,v in corpus.items():
    values.append(v)
    files.append(k)
    def beautify(self, element):
        ans = BeautifulSoup(str(element), 'lxml').text
        ans = ans[1:-1]
        return ans.split(', ')
Ejemplo n.º 55
0
def collectGlobals(folder_path):
    """
    Method to get all of the global and public objects from the understand docs
    It uses the the object cross reference pages of the understand docs

    """
    print("Collecting global/public objects from this project...")
    glbl_connection = sql.connect("globals.db")  #creates a globals.db file
    glbl_crsr = glbl_connection.cursor()  #crsr to execute commands

    create_globals_table = """CREATE TABLE globals (
        global_id INTEGER PRIMARY KEY,
        var_name VARCHAR(255),
        method_used VARCHAR(200), /* the method signature of where it was used */
        use_loc VARCHAR(255) /* line where it was used */
        );"""
    glbl_crsr.execute(
        create_globals_table)  #inline sql to create table for objects

    ref_page = 'Non-Alpha'
    count = 0
    #---regex compiles----#
    # stops the code from recompiling the regex and adding it to the cache every loop
    global_regex = r'.+Global Object\)'  #regex to search for the objs labeled public/global
    global_regex = re.compile(global_regex)
    public_regex = r'.+Public Object\)'
    public_regex = re.compile(public_regex)
    static_remove = re.compile(r'   \(Static')
    use_search = re.compile(' Use ')
    set_search = re.compile(' Set ')
    #---------------------#
    while ref_page == 'Non-Alpha' or (ord(ref_page) <= ord(
            'Z')):  # loop to loop through each of the pages for the variables
        print("Collecting global variables that start with ", ref_page, "...")
        objxref_html = open(folder_path + '\object_xref_' + ref_page + '.html')
        objxref_html = BeautifulSoup(
            objxref_html,
            'html.parser')  # get the page's html in a parsable object
        obj_blocks = str(objxref_html).split(
            '\n\n')  # break the page into units for each object
        for block in obj_blocks:
            matched = False
            if global_regex.search(block.split('\n')[0]):
                obj = 'Global'
                matched = True
            elif public_regex.search(block.split('\n')[0]):
                obj = 'Public'
                matched = True
            if matched:  #if the block has been identified as public/global object
                block_txt = BeautifulSoup(block, 'html.parser').getText()
                block_txt = block_txt.split('\n')
                tmp = block_txt[0].split(
                    '%s Object)  Declared as: ' %
                    obj)  #split up the first line's information

                if len(tmp) > 1:  #there is a Declared as:
                    var_name = tmp[1].strip() + "   " + tmp[0].strip()
                else:  # no Declared as
                    var_name = tmp[0].split()[0].strip()
                if static_remove.search(
                        var_name):  #get rid of the static part if it's there
                    var_name = var_name[:-8].strip()
                elif '(' in var_name:
                    var_name = var_name[:-2].strip()
                used_in_methods = {
                }  # now search the next lines of the block to find which methods the obj is used in
                for line in block_txt[1:]:
                    if (use_search.search(line)) or (set_search.search(line)):
                        line = line.split()
                        use_loc = line[-3] + " " + line[-2]
                        method_used = line[-1].strip()
                        if method_used not in used_in_methods:  #for the case where there are multiple lines in one method
                            used_in_methods[method_used] = [
                                use_loc
                            ]  # where the obj is used method = key, lines used = val
                        else:
                            used_in_methods[method_used].append(use_loc)
                if len(used_in_methods) > 0:
                    for method in used_in_methods:  # if there were places where the obj was used insert into the db
                        glbl_crsr.execute(
                            'INSERT INTO globals (global_id, var_name, method_used, use_loc) VALUES (?, ?, ?, ?)',
                            (
                                count,
                                var_name,
                                method,
                                str(used_in_methods[method]),
                            ))
                        count += 1

        if ref_page == 'Non-Alpha':
            ref_page = 'A'
        else:
            ref_page = chr(ord(ref_page) + 1)
    # save all changes made
    glbl_connection.commit()
    # finished
    glbl_connection.close()
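The snippet above only builds globals.db; as a minimal follow-up sketch (an assumption, not part of the original code), the resulting table can be queried afterwards to see which globals are touched in the most methods:

import sqlite3 as sql

# Sketch: rank globals by how many methods reference them,
# using the `globals` table created by the snippet above.
connection = sql.connect("globals.db")
crsr = connection.cursor()
crsr.execute("""
    SELECT var_name, COUNT(*) AS method_count
    FROM globals
    GROUP BY var_name
    ORDER BY method_count DESC
    LIMIT 10
""")
for var_name, method_count in crsr.fetchall():
    print(var_name, "is used in", method_count, "method(s)")
connection.close()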
Ejemplo n.º 56
0
from bs4 import BeautifulSoup

f = open("words.txt", "r")
g = open("words.final.txt", "w")

for line in f:
    k = BeautifulSoup(line, "html.parser").text  # name a parser explicitly to avoid bs4's warning
    k = " ".join([
        x for x in k.split(" ")
        if len(x) > 5 and not (x.startswith("@") or x.startswith(".mas")
                               or "http" in x or "png" in x or "jpg" in x)
    ])
    if len(k) > 1:
        g.write("{}\n".format(k.strip()))

f.close()
g.close()
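The same filter can also be written as a small reusable sketch with context managers and the rules pulled into one helper (file names and thresholds kept from the snippet above; the helper name is only illustrative):

from bs4 import BeautifulSoup

def keep_word(word):
    # Filter rules from the snippet above: drop short tokens, mentions, and media/URL fragments.
    return (len(word) > 5
            and not word.startswith("@")
            and not word.startswith(".mas")
            and "http" not in word
            and "png" not in word
            and "jpg" not in word)

with open("words.txt", "r") as src, open("words.final.txt", "w") as dst:
    for line in src:
        text = BeautifulSoup(line, "html.parser").get_text()
        cleaned = " ".join(w for w in text.split(" ") if keep_word(w))
        if len(cleaned) > 1:
            dst.write(cleaned.strip() + "\n")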
Ejemplo n.º 57
0
def update(update, context):
    """
    Обновляет ленту, отправляя новые посты с rss ленты
    """
    try:
        with open('sites.json', 'r') as file:
            site_list = json.load(file)

        new_site_list = []
        for site in site_list:
            raw_news = feedparser.parse(
                site['url'])['entries'][::-1]  # parse the feed and reverse the entry order

            date = site['date']
            keywords = site['keywords']
            new_date = raw_news[-1]['published']
            if keywords == [] and date is None:
                for new in raw_news:
                    time.sleep(0.01)
                    summary = BeautifulSoup(new['summary'],
                                            'html.parser').get_text()
                    summary = sentence(summary)
                    mes = new['title'] + "\n\n" + summary + "\n" + new[
                        'link']  # build the message to send

                    context.bot.send_message(chat_id=update.effective_chat.id,
                                             text=mes)

            elif keywords == [] and date is not None:
                for new in raw_news:
                    time.sleep(0.01)
                    summary = BeautifulSoup(new['summary'],
                                            'html.parser').get_text()
                    summary = sentence(summary)
                    if new['published'] != date:
                        mes = new['title'] + "\n\n" + summary + "\n" + new[
                            'link']  # build the message to send
                        context.bot.send_message(
                            chat_id=update.effective_chat.id, text=mes)
                    else:
                        break

            elif keywords != [] and date is None:
                time.sleep(0.01)
                stemmer = SnowballStemmer("russian")
                kwords = set([stemmer.stem(i) for i in keywords])

                for new in raw_news:
                    summary = BeautifulSoup(new['summary'],
                                            'html.parser').get_text()
                    summary = sentence(summary)
                    word_set = set(
                        [stemmer.stem(i) for i in summary.split(' ')])
                    if word_set & kwords != set():
                        mes = new['title'] + "\n\n" + summary + "\n" + new[
                            'link']  # build the message to send
                        context.bot.send_message(
                            chat_id=update.effective_chat.id, text=mes)

            elif keywords != [] and date is not None:
                time.sleep(0.01)
                stemmer = SnowballStemmer("russian")
                kwords = set([stemmer.stem(i) for i in keywords])

                for new in raw_news:

                    summary = BeautifulSoup(new['summary'],
                                            'html.parser').get_text()
                    summary = sentence(summary)
                    word_set = set(
                        [stemmer.stem(i) for i in summary.split(' ')])
                    if new['published'] == date:
                        break  # reached the last post that was already sent
                    if word_set & kwords != set():
                        mes = new['title'] + "\n\n" + summary + "\n" + new[
                            'link']  # build the message to send
                        context.bot.send_message(
                            chat_id=update.effective_chat.id, text=mes)

            new_site_list.append({
                'name': site['name'],
                'url': site['url'],
                'date': new_date,
                'keywords': site['keywords']
            })

        with open('sites.json', 'w') as file:
            json.dump(new_site_list, file)

    except FileNotFoundError:
        context.bot.send_message(
            chat_id=update.effective_chat.id,
            text='Нет списка сайтов, задайте его командой /sub')

    except Exception as exxx:
        context.bot.send_message(chat_id=update.effective_chat.id,
                                 text=('Возникла следующая ошибка ' +
                                       str(exxx)))
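The handler above calls a sentence() helper that is not included in the snippet, so its exact behavior is unknown. A purely hypothetical stand-in that trims a summary down to its first couple of sentences could look like this:

def sentence(summary, max_sentences=2):
    # Hypothetical stand-in for the sentence() helper used above (assumption):
    # keep only the first few sentences so the Telegram message stays short.
    parts = summary.replace('!', '.').replace('?', '.').split('.')
    parts = [p.strip() for p in parts if p.strip()]
    return '. '.join(parts[:max_sentences]) + ('.' if parts else '')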
Ejemplo n.º 58
    rows = table.find_all('tr')
    #print(len(rows))
    if (flag == 0):
        row_th = rows[0].find_all('th')
        str_cells = str(row_th)
        header = BeautifulSoup(str_cells, "lxml").get_text()
        flag = 1

    for row in rows[1:]:
        row_td = row.find_all('td')
        str_cells1 = str(row_td)
        cleantext = BeautifulSoup(str_cells1, "lxml").get_text()

        # create a list from cleantext
        stop_event_rows = cleantext.split(", ")

        # strip "[" from first element of the list and "]" from the last element of the list
        x = stop_event_rows[0].split("[")
        stop_event_rows[0] = x[1]

        size = len(stop_event_rows)
        x = stop_event_rows[size - 1].split("]")
        stop_event_rows[size - 1] = x[0]

        for _ in range(len(stop_event_rows)):
            data = {}
            data["trip_id"] = trip
            data["vehicle_number"] = stop_event_rows[0]
            data["leave_time"] = stop_event_rows[1]
            data["train"] = stop_event_rows[2]
Ejemplo n.º 59
0
    def _parse_energy_level_section(section_str, last_data=None):

        data = {}
        splitted_str = section_str.split('\n')
        for i, line in enumerate(splitted_str):
            clean_str = BeautifulSoup(line.strip(), "lxml").text
            if sys.version_info[0] < 3:  # Python 2 needs an explicit UTF-8 encode
                clean_str = clean_str.encode("utf-8")
            
            
            if clean_str.strip() == '': continue
            
            if i == 0: data['configuration'] = clean_str.replace('\xa0', '')
            
            if i == 1: data['term'] = clean_str.replace('\xa0', '')
            
            if i == 3:
                # print("i == 3 : ", clean_str)
                if '?' in clean_str.strip():
                    clean_str = clean_str.replace('?','')
                
                if ',' in clean_str:
                    clean_str = clean_str.split(',')[0]
                
                if '/' in clean_str.strip():
                    # resplit = re.split("a?\/a?", clean_str)
                    # data['J'] = float(resplit[0].replace(' ', '')) / float(resplit[1])
                    resplit = clean_str.strip().split('/')
                    try:
                        data['J'] = float(resplit[0]) / float(resplit[1])
                    except ValueError:
                        print("clean_str = ", clean_str)
                        exit()
                        
                    # print("data['J'] = ", data['J'])
                else:
                    try:
                        data['J'] = float(clean_str.strip())
                    except:
                        logger.error("Could not read: {0}".format(clean_str.strip()))
                        # if ',' in clean_str:
                        #     data['J'] = clean_str.strip()
                        # else:
                        #     resplit = re.split("a?\/a?", clean_str)
                        #     try:
                        #         if len(resplit) == 2:
                        #             data['J'] = float(resplit[0].replace(' ', '')) / float(resplit[1])
                        #         else:
                        #             data['J'] = int(clean_str.strip())
                        #     except ValueError:
                        #         logger.error("Could not read: {0}".format(clean_str.strip()))
                
            if i == 4:
                clean_str = clean_str.strip().replace(' ', '').replace('(','').replace(')','').replace('[','').replace(']','')
                
                refind1 = re.findall(r"\d+\.\d+", clean_str)

                # re.findall always returns a list, so use the single decimal match if there is one
                if len(refind1) == 1:
                    data['level (eV)'] = float(refind1[0])
                else:
                    data['level (eV)'] = float(clean_str)

                # print("refind1 = ", refind1, " | clean_str = ", clean_str, " | data['level (eV)'] = ",
                #       data['level (eV)'])
            
            try:
                if i == 5: data['uncertainty (eV)'] = float(clean_str.replace(' ', ''))
            except ValueError:
                logger.error("Could not read: {0}".format(clean_str.replace(' ', '')))
            
            if i == 6: data['level splittings (eV)'] = float(clean_str.replace(' ', ''))
            
            try:
                if i == 7: data['leading percentages'] = float(clean_str)
            except ValueError:  # leading percentage is not always there
                if i == 7: data['reference'] = clean_str.replace('\xa0','')

        if 'configuration' not in data:
            data['configuration'] = ''
            
        if 'term' not in data:
            data['term'] = ''
        
        if data['configuration'] == '':  # fall back to the previous entry's configuration
            data['configuration'] = last_data['configuration']

        if data['term'] == '':
            data['term'] = last_data['term']
        
        return data
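A small usage sketch for the parser above, using a synthetic section string (the line layout -- configuration, term, a skipped line, J, level, uncertainty, splittings, leading percentages -- follows the index checks in the function; real NIST markup may differ, and in the original the function may be a method rather than a free function):

section = "\n".join([
    "3s2.3p",   # i == 0: configuration
    "2P*",      # i == 1: term
    "",         # i == 2: skipped by the parser
    "1/2",      # i == 3: J
    "0.0000",   # i == 4: level (eV)
    "0.0001",   # i == 5: uncertainty (eV)
    "0.0000",   # i == 6: level splittings (eV)
    "100",      # i == 7: leading percentages
])
level = _parse_energy_level_section(section)
print(level['configuration'], level['term'], level['J'], level['level (eV)'])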
Ejemplo n.º 60
-25
import requests
from bs4 import BeautifulSoup

def parsePage(url):
	r = requests.get(url)
	data = r.text
	soup = BeautifulSoup(data, 'html.parser')


	invalid_tags = ['b', 'i', 'u', 'ul','li', 'p','em']
	soup = soup.find(id='primary')



	for tag in invalid_tags:
		for match in soup.findAll(tag):
			match.replaceWithChildren()

	for match in soup.findAll('span'):
		match.replaceWith('')

	for match in soup.findAll('div'):
		match.replaceWith('')


	soup = str(soup)
	soup = soup.replace('<strong>', "%")
	soup = soup.replace('</strong>', "%")
	finalOutput = soup.split('%')

	for n in range(0,4):
		finalOutput[n]=""

	return finalOutput
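A hedged usage sketch for parsePage (the URL is only a placeholder; the function returns the text of the page's id="primary" region split on <strong> boundaries, with the first four pieces blanked out):

if __name__ == "__main__":
	# Placeholder URL (assumption) -- any page with an element whose id is "primary".
	sections = parsePage("https://example.com/some-article")
	for i, section in enumerate(sections):
		if section.strip():
			print(i, section[:80])  # preview the first 80 characters of each piece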