def main():
    args = parser.parse_args()
    if not args.ignore_index and os.path.isfile(args.index):
        with open(args.index, 'r') as f:
            lines = f.readlines()
        lines = [line.strip() for line in lines]
        lines = [line for line in lines if line]
    else:
        lines = []
    if args.url in lines:
        print "EXISTENT"
        return 0
    try:
        content = webarticle2text.extractFromURL(args.url)
        content = process_content(content)
    except Exception as e:
        log(args.log, args.url + ": Exception " + str(e.__class__))
        log(args.log, e.message)
        log(args.log, traceback.format_exc())
        print "FAIL"
        return 1
    if len(content) < 200:
        log(args.log, args.url + ": Invalid content")
        print "FAIL"
        return 2
    try:
        m = re.match('(?:https?://)?(?:www\.)?(.+?)\..*$', args.url)
        source = m.groups()[0]
        # Post-process the content
        with open(args.output, 'a') as f:
            f.write('{' + os.linesep)
            f.write('    "tag": "{}",'.format(args.tag.upper()) + os.linesep)
            f.write('    "content": "{}",'.format(content) + os.linesep)
            f.write('    "source": "{}",'.format(source) + os.linesep)
            f.write('    "url": "{}"'.format(args.url) + os.linesep)
            f.write('},' + os.linesep)
        log(args.log, args.url + ": Data written")
        if not args.ignore_index:
            ensure_directory(args.index)
            with open(args.index, 'a+') as f:
                f.write(args.url + os.linesep)
        print "SUCCESS"
        return 0
    except Exception as e:
        log(args.log,
            args.url + ": Can't write file due to Exception " + str(e.__class__))
        log(args.log, args.url + ": " + e.message)
        log(args.log, args.url + ": " + traceback.format_exc())
        print "FAIL"
        return 1
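The record above is built by interpolating the extracted article text into hand-formatted JSON-like strings, so quotes or newlines in the content would corrupt the output file. A minimal alternative sketch (not the original script's code; the helper name and signature are assumptions, the field names come from the snippet above) lets json.dumps handle the escaping:

import json
import os

def write_record(f, tag, content, source, url):
    # Hypothetical helper: json.dumps escapes quotes and newlines in the
    # extracted text, which the hand-built format strings above do not.
    record = {"tag": tag.upper(), "content": content, "source": source, "url": url}
    f.write(json.dumps(record, indent=4) + ',' + os.linesep)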
def fetch(url):
    #request = urllib2.Request(url)
    #f = urllib2.urlopen(request)
    articles.append(webarticle2text.extractFromURL(url, timeout=60))
    articleURLs.append(url)
    print(articleURLs[len(articleURLs) - 1])
    print(articles[len(articles) - 1])
    #f.close()
    print "\r" + str(len(articles)) + " / " + str(len(results)) + " articles crawled to.",
    sys.stdout.flush()
def startanalyzing():
    file_path = open("output/uri_list.txt", "r")
    url = file_path.readline()
    while url:
        try:
            data = webarticle2text.extractFromURL(url)
        except Exception:
            #print "[$]Unable to fetch url :"+url
            time.sleep(2)
            #clear(0)
            # Skip this URL; otherwise `data` would be undefined below.
            url = file_path.readline()
            continue
        text = nltk.word_tokenize(data)
        tagged_data = nltk.pos_tag(text)
        grammar = "NP: {<DT>?<JJ>*<NN>}"
        cp = nltk.RegexpParser(grammar)
        result_tree = cp.parse(tagged_data)
        pairs = relextract.mk_pairs(result_tree)
        reldicts = relextract.mk_reldicts(pairs)
        loop = 0
        while loop < len(reldicts):
            for k, v in reldicts[loop].items():
                #print k, '=>', v
                if k == "objsym":
                    objsym = v
                    objsym = removeNonAscii(objsym)
                if k == "subjsym":
                    subjsym = v
                    subjsym = removeNonAscii(subjsym)
                if k == "lcon":
                    lcon = v
                    lcon = ((re.sub('/[A-Z]+', '', lcon)).replace(",/,", "")).replace("'", "")
                    lcon = removeNonAscii(lcon)
                if k == "filler":
                    filler = v
                    filler = ((re.sub('/[A-Z]+', '', filler)).replace(",/,", "")).replace("'", "")
                    filler = removeNonAscii(filler)
                if k == "rcon":
                    rcon = v
                    rcon = ((re.sub('/[A-Z]+', '', rcon)).replace(",/,", "")).replace("'", "")
                    rcon = removeNonAscii(rcon)
            if len(subjsym) > 4 and len(lcon) > 4 and len(rcon) > 4 and len(filler) > 4:
                update_db(url, objsym, subjsym, lcon, filler, rcon)
            loop += 1
        url = file_path.readline()
def retrieve_article_content(self, force=False):
    import webarticle2text
    if self.article_content and not force:
        return
    self.article_content = webarticle2text.extractFromURL(
        self.link,
        only_mime_types=conf.GET_ARTICLE_CONTENT_ONLY_MIME_TYPES,
        ignore_robotstxt=True,
        userAgent=ua.random)
    self.article_content_error_code = None
    self.article_content_error_reason = None
    self.article_content_success = bool((self.article_content or '').strip())
    self.article_content_error = None
    self.save()
def collect_with_feedzilla():
    news = Feedzilla.collect()
    for n in news:
        cluster = Cluster()
        cluster.display_name = n["title"]
        cluster.relevancy = 1.0 / n["search_place"] * 100
        cluster.topic = Topic.objects.get(short_name=str(n["topic"]))
        cluster.date = format_date(n["publish_date"])
        cluster.save()
        if n["check_url"]:
            try:
                cluster_content = extractFromURL(n["url"])
                cluster_countries = Placemaker.get_countries_from_string(cluster_content)
                for country in cluster_countries:
                    try:
                        l = Location.objects.get(name=country)
                    except Location.DoesNotExist:
                        l = Location()
                        l.name = country
                        l.save()
                    cluster.location.add(l)
                cluster_continents = Placemaker.get_continents_from_countries(cluster_countries)
                for continent in cluster_continents:
                    try:
                        c = Continent.objects.get(name=continent)
                    except Continent.DoesNotExist:
                        c = Continent()
                        c.name = continent
                        c.save()
                    cluster.continent_location.add(c)
            except TypeError:
                print "webarticle2text Error"
        a = Article()
        a.title = n["title"]
        a.url = n["url"]
        a.publisher = n["source"]
        a.content = n["summary"][:999]
        a.published_date = format_date(n["publish_date"])
        a.cluster = cluster
        a.save()
def scrape_hn(hn_url):
    parser = feedparser.parse(hn_url)
    for entry in parser.entries:
        url = entry.link
        _id = md5.md5(url).hexdigest()
        title = entry.title
        try:
            body = webarticle2text.extractFromURL(url)
            wordcount = len(body.split(" "))
        except:
            wordcount = -1
        coll.update(
            {"_id": _id},
            {"$set": {"title": title, "wordcount": wordcount}},
            upsert=True)
        print 'Added url %s' % url
def scrape_reddit():
    docs = reddit.Reddit(user_agent='app').get_front_page(limit=250)
    for item in docs:
        url = item.url
        _id = md5.md5(url).hexdigest()
        title = item.title
        try:
            body = webarticle2text.extractFromURL(url)
            wordcount = len(body.split(" "))
        except:
            wordcount = -1
        coll.update(
            {"_id": _id},
            {"$set": {"title": title, "wordcount": wordcount, "url": url}},
            upsert=True)
        print 'Added url %s' % url
def getSentimentOfArticle(articleURL, articleNumber, sentimentsFileName, dateToSearch, mutex_writefile):
    tries = 0
    sentiment = ""
    print "\tArticle #" + str(articleNumber) + " for date " + dateToSearch.strftime("%m/%d/%Y") + " being analyzed..."
    while tries < MAX_TRIES:
        try:
            sentiment = sentimentToNumber(
                datum_box.sentiment_analysis(webarticle2text.extractFromURL(articleURL)))
            break
        except socket.timeout:
            print("\t ^^Article #" + str(articleNumber) + " timed out " + str(tries + 1) + " time(s)...")
            tries = tries + 1
    if tries == MAX_TRIES:
        return
    mutex_writefile.acquire()
    sentimentsFile = open(sentimentsFileName, 'a')
    sentimentsFile.write(articleURL + seperator)
    sentimentsFile.write(dateToSearch.strftime("%m/%d/%Y") + seperator)
    sentimentsFile.write(sentiment)
    sentimentsFile.write("\n")
    sentimentsFile.close()
    mutex_writefile.release()
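Several of these examples handle flaky fetches separately: one passes timeout=60 to extractFromURL, another retries on socket.timeout. A small helper combining the two patterns is sketched below; the function name and the retries/timeout defaults are assumptions, not part of any of the original scripts.

import socket
import time
import webarticle2text

def fetch_article(url, retries=3, timeout=60):
    # Hypothetical helper: retry transient socket timeouts a few times before
    # giving up, mirroring the retry loop in the sentiment example above.
    for attempt in range(retries):
        try:
            return webarticle2text.extractFromURL(url, timeout=timeout)
        except socket.timeout:
            time.sleep(2)
    return ""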
import webarticle2text
import json
import sys
from csv import DictWriter

reload(sys)
sys.setdefaultencoding('utf-8')

baseUrl = 'http://www.mrqe.com'

o = DictWriter(
    open("/home/ashwin/WorkBench/Dreams/ML/Scrapy/tutorial/review_text.csv", 'w'),
    ["Original_Title", "critic", "link", "text"])
o.writeheader()

with open('/home/ashwin/WorkBench/Dreams/ML/Scrapy/tutorial/reviews.json', 'r') as f:
    data = json.load(f)
    for idx, item in enumerate(data):
        if item['links']:
            try:
                text = webarticle2text.extractFromURL(item['links'])
                if text:
                    o.writerow({
                        'Original_Title': item['Original_Title'],
                        'link': item['links'],
                        'text': text,
                        'critic': item['critic'],
                    })
            except:
                print(json.dumps(item, sort_keys=True, indent=2, separators=(',', ': ')))
                print(idx, item['links'], " Unexpected error:", sys.exc_info()[0])
def final_tuples(self):
    id_list = self.return_unique_ids()
    for artl in self.artl_json:
        if "embersId" in artl:
            artl_id = artl["embersId"]
        else:
            artl_id = artl["embers_id"]
        if artl_id in id_list:
            id_list.remove(artl_id)
        else:
            continue
        if artl[u'location'][u'country'] not in self.country_list:
            continue
        latitude_num = float(artl[u'location'][u'lat'])
        try:
            longitude_num = float(artl[u'location'][u'lng'])
        except Exception:
            longitude_num = float(artl[u'location'][u'lon'])
        artl_dt = dtparse(artl['date']).date()
        if artl_dt < self.timestamp[0] or artl_dt >= self.timestamp[len(self.timestamp) - 1]:
            continue
        try:
            finalURL = (urlopen(artl['link'])).geturl()
            article_source = urlparse.urlparse(finalURL).netloc
            articleprovince = list(locationcls.lookup_city(latitude_num, longitude_num, 360.)[0])[2]
            articlecountry = list(locationcls.lookup_city(latitude_num, longitude_num, 360.)[0])[1]
        except Exception:
            continue
        # Collect tokens either from the raw article text or from the
        # BasisEnrichment annotations; initialize here so both branches work.
        tokens = []
        if 'BasisEnrichment' not in artl:
            try:
                content_web = webarticle2text.extractFromURL(finalURL)
            except Exception:
                content_web = ""
            content_descr = artl['descr']
            tokens = nltk.word_tokenize(content_descr)
            try:
                tokens_1 = nltk.word_tokenize(content_web)
                for word in tokens_1:
                    tokens.append(word)
            except Exception:
                tokens_1 = []
        else:
            POS_list = ["DIG", "PUNCT", "SYM", "SENT", "CM"]
            if not (not (artl['BasisEnrichment']['tokens'])):
                tokenlist = artl['BasisEnrichment']['tokens']
                for element in tokenlist:
                    if element['POS'] not in POS_list:
                        tokens.append(element['value'])
        token_filtered = []
        token_normalized = []
        for a in xrange(len(self.timestamp) - 1):
            if self.timestamp[a] <= artl_dt < self.timestamp[a + 1]:
                timestampindex = self.start_ind + a
                break
        for word in tokens:
            word_split = re.split('(\W+)', word)
            if len(word_split) == 1:
                if len(word_split[0]) > 2 and len(word_split[0]) < 15:
                    token_filtered.append(word)
                else:
                    continue
            elif (len(word_split) == 3 and word_split[2] == '' and
                  len(word_split[0]) > 2 and len(word_split[0]) < 15):
                token_filtered.append(word_split[0])
        for word in token_filtered:
            try:
                if not self.contains_digits(word) and word not in self.stop_words:
                    token_normalized.append(utils.normalize_str(word))
            except Exception:
                continue
        token_unique = list(set(token_normalized))
        for word in token_unique:
            self.final_dict[(word, (articleprovince, articlecountry),
                             article_source, timestampindex)] += token_normalized.count(word)
    return
import webarticle2text

print webarticle2text.extractFromURL("http://sanguoshaenglish.blogspot.com/2010/07/liu-bei.html")
import webarticle2text
import sys

with open('HOCTranscriptURL.txt') as f:
    urllist = f.read().splitlines()

for url in urllist:
    filename = url.split("=")[-1]
    filename = "./Transcript/HOC_" + filename + ".txt"
    article = webarticle2text.extractFromURL(url)
    with open(filename, 'a') as the_file:
        the_file.write(article)
    print filename + " written!"
import os
import json
import webarticle2text
from csv import DictReader, DictWriter

# `progress` and `baseUrl` are defined earlier in the original script and are
# not shown in this excerpt.
file_exists = os.path.isfile(
    "/home/ashwin/WorkBench/Dreams/ML/Scrapy/tutorial/review2_text.csv")
o = DictWriter(
    open("/home/ashwin/WorkBench/Dreams/ML/Scrapy/tutorial/review2_text.csv", 'a', 0),
    ["id", "Original_Title", "domain", "critic", "link", "text"])
if not file_exists:
    o.writeheader()

with open('/home/ashwin/WorkBench/Dreams/ML/Scrapy/tutorial/reviews2.csv', 'r') as f:
    reader = DictReader(f)
    for item in reader:
        if int(item['id']) <= progress:
            continue
        if item['link']:
            try:
                text = webarticle2text.extractFromURL(baseUrl + item['link'])
                if text:
                    o.writerow({
                        'Original_Title': item['Original_Title'],
                        'id': item['id'],
                        'domain': item['domain'],
                        'link': item['link'],
                        'text': text,
                        'critic': item['critic'],
                    })
            except:
                print(json.dumps(item, sort_keys=True, indent=2, separators=(',', ': ')))
import webarticle2text
import sys

# print webarticle2text.extractFromURL("http://sanguoshaenglish.blogspot.com/2010/07/liu-bei.html")

article = webarticle2text.extractFromURL(sys.argv[1])