Example #1
def main():
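    # Skip URLs already recorded in the index file, extract the article text
    # and run it through process_content, append the result to the output
    # file as a JSON-style record, and note the URL in the index so it is
    # not fetched again.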
    args = parser.parse_args()

    if not args.ignore_index and os.path.isfile(args.index):
        with open(args.index, 'r') as f:
            lines = f.readlines()
            lines = [line.strip() for line in lines]
            lines = [line for line in lines if line]
    else:
        lines = []

    if args.url in lines:
        print "EXISTENT"
        return 0

    try:
        content = webarticle2text.extractFromURL(args.url)
        content = process_content(content)
    except Exception as e:
        log(args.log, args.url + ": Exception " + str(e.__class__))
        log(args.log, e.message)
        log(args.log, traceback.format_exc())

        print "FAIL"
        return 1

    if len(content) < 200:
        log(args.log, args.url + ": Invalid content")
        print "FAIL"
        return 2

    try:
        m = re.match('(?:https?://)?(?:www\.)?(.+?)\..*$', args.url)
        source = m.groups()[0]
        # Append the article as a JSON-style record to the output file
        with open(args.output, 'a') as f:
            f.write('{' + os.linesep)
            f.write('  "tag": "{}",'.format(args.tag.upper()) + os.linesep)
            f.write('  "content": "{}",'.format(content) + os.linesep)
            f.write('  "source": "{}",'.format(source) + os.linesep)
            f.write('  "url": "{}"'.format(args.url) + os.linesep)
            f.write('},' + os.linesep)
        log(args.log, args.url + ": Data written")

        if not args.ignore_index:
            ensure_directory(args.index)
            with open(args.index, 'a+') as f:
                f.write(args.url + os.linesep)

        print "SUCCESS"
        return 0

    except Exception as e:
        log(
            args.log, args.url + ": Can't write file due to Exception " +
            str(e.__class__))
        log(args.log, args.url + ": " + e.message)
        log(args.log, args.url + ": " + traceback.format_exc())
        print "FAIL"
        return 1
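The hand-assembled record above breaks as soon as the content, tag or URL contains a double quote or a newline, because nothing is escaped before it is written. A minimal sketch of the same write using json.dumps (assuming the same args, content and source variables as in the example; not part of the original code):

import json
import os

# json.dumps escapes quotes and newlines inside the values, so the output
# stays parseable; the trailing comma mirrors the original format.
record = {
    "tag": args.tag.upper(),
    "content": content,
    "source": source,
    "url": args.url,
}
with open(args.output, 'a') as f:
    f.write(json.dumps(record, indent=2) + ',' + os.linesep)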
Example #2
def main():
    args = parser.parse_args()

    if not args.ignore_index and os.path.isfile(args.index):
        with open(args.index, 'r') as f:
            lines = f.readlines()
            lines = [line.strip() for line in lines]
            lines = [line for line in lines if line]
    else:
        lines = []

    if args.url in lines:
        print "EXISTENT"
        return 0

    try:
        content = webarticle2text.extractFromURL(args.url)
        content = process_content(content)
    except Exception as e:
        log(args.log, args.url + ": Exception " + str(e.__class__))
        log(args.log, e.message)
        log(args.log, traceback.format_exc())

        print "FAIL"
        return 1

    if len(content) < 200:
        log(args.log, args.url + ": Invalid content")
        print "FAIL"
        return 2

    try:
        m = re.match('(?:https?://)?(?:www\.)?(.+?)\..*$', args.url)
        source = m.groups()[0]
        # Append the article as a JSON-style record to the output file
        with open(args.output, 'a') as f:
            f.write('{' + os.linesep)
            f.write('  "tag": "{}",'.format(args.tag.upper()) + os.linesep)
            f.write('  "content": "{}",'.format(content) + os.linesep)
            f.write('  "source": "{}",'.format(source) + os.linesep)
            f.write('  "url": "{}"'.format(args.url) + os.linesep)
            f.write('},' + os.linesep)
        log(args.log, args.url + ": Data written")

        if not args.ignore_index:
            ensure_directory(args.index)
            with open(args.index, 'a+') as f:
                f.write(args.url + os.linesep)

        print "SUCCESS"
        return 0

    except Exception as e:
        log(args.log, args.url + ": Can't write file due to Exception " + str(e.__class__))
        log(args.log, args.url + ": " + e.message)
        log(args.log, args.url + ": " + traceback.format_exc())
        print "FAIL"
        return 1
Example #3
def fetch(url):
    # Extract the article text for one URL, record it alongside its URL and
    # print a carriage-return progress counter as articles are crawled.
    #request = urllib2.Request(url)
    #f = urllib2.urlopen(request)
    articles.append(webarticle2text.extractFromURL(url, timeout=60))
    articleURLs.append(url)
    print(articleURLs[len(articleURLs) - 1])
    print(articles[len(articles) - 1])
    #f.close()
    print "\r" + str(len(articles)) + " / " + str(len(results)) + " articles crawled.",
    sys.stdout.flush()
def startanalyzing():
    # Read URLs from output/uri_list.txt, extract each article, POS-tag it,
    # chunk noun phrases and pass the extracted relation fields to update_db.
    file_path = open("output/uri_list.txt", "r")
    url = file_path.readline()
    while url:
        try:
            data = webarticle2text.extractFromURL(url)
        except Exception:
            #print "[$]Unable to fetch url :"+url
            time.sleep(2)
            #clear(0)
            url = file_path.readline()
            continue  # without this, `data` may be undefined below

        text = nltk.word_tokenize(data)
        tagged_data = nltk.pos_tag(text)

        grammar = "NP: {<DT>?<JJ>*<NN>}"

        cp = nltk.RegexpParser(grammar)
        result_tree = cp.parse(tagged_data)

        pairs = relextract.mk_pairs(result_tree)
        reldicts = relextract.mk_reldicts(pairs)

        loop = 0
        while loop < len(reldicts):
            # Default to empty strings so the length check below cannot hit
            # an undefined name when a key is missing from the relation dict.
            objsym = subjsym = lcon = filler = rcon = ""
            for k, v in reldicts[loop].items():
                #print k, '=>', v
                if k == "objsym":
                    objsym = removeNonAscii(v)
                if k == "subjsym":
                    subjsym = removeNonAscii(v)
                if k == "lcon":
                    lcon = ((re.sub('/[A-Z]+', '', v)).replace(",/,", "")).replace("'", "")
                    lcon = removeNonAscii(lcon)
                if k == "filler":
                    filler = ((re.sub('/[A-Z]+', '', v)).replace(",/,", "")).replace("'", "")
                    filler = removeNonAscii(filler)
                if k == "rcon":
                    rcon = ((re.sub('/[A-Z]+', '', v)).replace(",/,", "")).replace("'", "")
                    rcon = removeNonAscii(rcon)
            if len(subjsym) > 4 and len(lcon) > 4 and len(rcon) > 4 and len(filler) > 4:
                update_db(url, objsym, subjsym, lcon, filler, rcon)
            loop += 1
        url = file_path.readline()
Example #5
    def retrieve_article_content(self, force=False):
        # Fetch and cache the article body for this entry; skip the network
        # call when content is already present unless force=True.
        import webarticle2text
        if self.article_content and not force:
            return
        self.article_content = webarticle2text.extractFromURL(
            self.link,
            only_mime_types=conf.GET_ARTICLE_CONTENT_ONLY_MIME_TYPES,
            ignore_robotstxt=True,
            userAgent=ua.random)
        self.article_content_error_code = None
        self.article_content_error_reason = None
        self.article_content_success = bool((self.article_content or '').strip())
        self.article_content_error = None
        self.save()
Example #6
    def collect_with_feedzilla():
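        # Pull stories from Feedzilla, create a Cluster per story, geotag it
        # with countries and continents extracted from the article text, and
        # save an Article record for the story itself.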
        news = Feedzilla.collect()
        for n in news:
            cluster = Cluster()
            cluster.display_name = n["title"]
            cluster.relevancy = 1.0 / n["search_place"] * 100
            cluster.topic = Topic.objects.get(short_name=str(n["topic"]))
            cluster.date = format_date(n["publish_date"])
            cluster.save()
            if (n["check_url"]):
                try:
                    cluster_content = extractFromURL(n["url"])
                    cluster_countries = Placemaker.get_countries_from_string(
                        cluster_content)
                    for country in cluster_countries:
                        try:
                            l = Location.objects.get(name=country)
                        except Location.DoesNotExist:
                            l = Location()
                            l.name = country
                            l.save()
                        cluster.location.add(l)

                    cluster_continents = Placemaker.get_continents_from_countries(
                        cluster_countries)
                    for continent in cluster_continents:
                        try:
                            c = Continent.objects.get(name=continent)
                        except Continent.DoesNotExist:
                            c = Continent()
                            c.name = continent
                            c.save()
                        cluster.continent_location.add(c)
                except TypeError:
                    print "webarticle2text Error"

            a = Article()
            a.title = n["title"]
            a.url = n["url"]
            a.publisher = n["source"]
            a.content = n["summary"][:999]
            a.published_date = format_date(n["publish_date"])
            a.cluster = cluster
            a.save()
Example #7
def scrape_hn(hn_url):
	parser = feedparser.parse(hn_url)
	for entry in parser.entries:
		url = entry.link
		_id = md5.md5(url).hexdigest()
		title = entry.title
		try:
			body = webarticle2text.extractFromURL(url)
			wordcount = len(body.split(" "))
		except:
			wordcount = -1
		
		coll.update(
		{"_id": _id},
		{"$set": {"title": title,
				  "wordcount": wordcount}},
		upsert = True)
		
		print 'Added url %s' % url
Example #8
    def collect_with_feedzilla():
        news = Feedzilla.collect()
        for n in news:
            cluster = Cluster()
            cluster.display_name = n["title"]
            cluster.relevancy = 1.0 / n["search_place"] * 100
            cluster.topic = Topic.objects.get(short_name=str(n["topic"]))
            cluster.date = format_date(n["publish_date"])
            cluster.save()
            if n["check_url"]:
                try:
                    cluster_content = extractFromURL(n["url"])
                    cluster_countries = Placemaker.get_countries_from_string(cluster_content)
                    for country in cluster_countries:
                        try:
                            l = Location.objects.get(name=country)
                        except Location.DoesNotExist:
                            l = Location()
                            l.name = country
                            l.save()
                        cluster.location.add(l)

                    cluster_continents = Placemaker.get_continents_from_countries(cluster_countries)
                    for continent in cluster_continents:
                        try:
                            c = Continent.objects.get(name=continent)
                        except Continent.DoesNotExist:
                            c = Continent()
                            c.name = continent
                            c.save()
                        cluster.continent_location.add(c)
                except TypeError:
                    print "webarticle2text Error"

            a = Article()
            a.title = n["title"]
            a.url = n["url"]
            a.publisher = n["source"]
            a.content = n["summary"][:999]
            a.published_date = format_date(n["publish_date"])
            a.cluster = cluster
            a.save()
Example #9
def scrape_reddit():
	docs = reddit.Reddit(user_agent='app').get_front_page(limit=250)
	for item in docs:
		url = item.url
		_id = md5.md5(url).hexdigest()
		title = item.title
		try:
			body = webarticle2text.extractFromURL(url)
			wordcount = len(body.split(" "))
		except:
			wordcount = -1
			
		coll.update(
				{"_id": _id},
				{"$set": {"title": title,
						  "wordcount": wordcount,
						  "url": url}},
				upsert = True)
				
		print 'Added url %s' % url		
Example #10
def scrape_hn(hn_url):
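    # Walk the Hacker News RSS feed, extract each linked article and upsert
    # its title and word count into the MongoDB collection, keyed by a hash
    # of the URL.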
    parser = feedparser.parse(hn_url)
    for entry in parser.entries:
        url = entry.link
        _id = md5.md5(url).hexdigest()
        title = entry.title
        try:
            body = webarticle2text.extractFromURL(url)
            wordcount = len(body.split(" "))
        except:
            wordcount = -1

        coll.update({"_id": _id},
                    {"$set": {
                        "title": title,
                        "wordcount": wordcount
                    }},
                    upsert=True)

        print 'Added url %s' % url

def getSentimentOfArticle(articleURL, articleNumber, sentimentsFileName, dateToSearch, mutex_writefile):
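    # Extract the article and run it through datum_box.sentiment_analysis,
    # retrying up to MAX_TRIES times on socket timeouts, then append the URL,
    # date and sentiment score to the shared results file under a write lock.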
    tries = 0
    sentiment = ""
    print "\tArticle #" + str(articleNumber) + " for date " + dateToSearch.strftime("%m/%d/%Y") + " being analyzed..."
    while tries < MAX_TRIES:
        try:
            sentiment = sentimentToNumber(datum_box.sentiment_analysis(webarticle2text.extractFromURL(articleURL)))
            break
        except socket.timeout:
            print("\t ^^Article #" + str(articleNumber) + " timed out " + str(tries + 1) + " time(s)...")
            tries = tries + 1
    if tries == MAX_TRIES:
        return
    mutex_writefile.acquire()
    sentimentsFile = open(sentimentsFileName, 'a')
    sentimentsFile.write(articleURL + seperator)
    sentimentsFile.write(dateToSearch.strftime("%m/%d/%Y") + seperator)
    sentimentsFile.write(sentiment)
    sentimentsFile.write("\n")
    sentimentsFile.close()
    mutex_writefile.release()
Example #12
def scrape_reddit():
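    # Walk the Reddit front page, extract each linked article and upsert its
    # title, word count and URL into the MongoDB collection, keyed by a hash
    # of the URL.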
    docs = reddit.Reddit(user_agent='app').get_front_page(limit=250)
    for item in docs:
        url = item.url
        _id = md5.md5(url).hexdigest()
        title = item.title
        try:
            body = webarticle2text.extractFromURL(url)
            wordcount = len(body.split(" "))
        except:
            wordcount = -1

        coll.update(
            {"_id": _id},
            {"$set": {
                "title": title,
                "wordcount": wordcount,
                "url": url
            }},
            upsert=True)

        print 'Added url %s' % url
# Scrape the review pages listed in reviews.json and dump their extracted
# text to review_text.csv.
import json
import sys
from csv import DictWriter

import webarticle2text

reload(sys)
sys.setdefaultencoding('utf-8')
baseUrl = 'http://www.mrqe.com'
o = DictWriter(
    open("/home/ashwin/WorkBench/Dreams/ML/Scrapy/tutorial/review_text.csv",
         'w'), ["Original_Title", "critic", "link", "text"])
o.writeheader()
with open('/home/ashwin/WorkBench/Dreams/ML/Scrapy/tutorial/reviews.json',
          'r') as f:
    data = json.load(f)
    for idx, item in enumerate(data):
        if item['links']:
            try:
                text = webarticle2text.extractFromURL(item['links'])
                if text:
                    o.writerow({
                        'Original_Title': item['Original_Title'],
                        'link': item['links'],
                        'text': text,
                        'critic': item['critic'],
                    })
            except:
                print(
                    json.dumps(item,
                               sort_keys=True,
                               indent=2,
                               separators=(',', ': ')))
                print(idx, item['links'], " Unexpected error:",
                      sys.exc_info()[0])
Example #14
    def final_tuples(self):
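        # For each new article: filter by country and date window, resolve the
        # final URL and its location, tokenize either the fetched page text
        # plus the description or the BasisEnrichment tokens, then accumulate
        # normalized token counts in self.final_dict keyed by
        # (word, (province, country), source, timestamp index).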

        id_list = self.return_unique_ids()
        for artl in self.artl_json:
            if "embersId" in artl:
                artl_id = artl["embersId"]
            else:
                artl_id = artl["embers_id"]
            if artl_id in id_list:
                id_list.remove(artl_id)
            else:
                continue

            if artl[u'location'][u'country'] not in self.country_list:
                continue
            latitude_num = float(artl[u'location'][u'lat'])
            try:
                longitude_num = float(artl[u'location'][u'lng'])
            except Exception:
                longitude_num = float(artl[u'location'][u'lon'])
            artl_dt = dtparse(artl['date']).date()
            if artl_dt < self.timestamp[0] or artl_dt >= self.timestamp[len(self.timestamp) - 1]:
                continue
            try:
                finalURL = (urlopen(artl['link'])).geturl()
                article_source = urlparse.urlparse(finalURL).netloc
                articleprovince = list(locationcls.lookup_city(latitude_num, longitude_num, 360.)[0])[2]
                articlecountry = list(locationcls.lookup_city(latitude_num, longitude_num, 360.)[0])[1]
            except Exception:
                continue
            if 'BasisEnrichment' not in artl:
                try:
                    content_web = webarticle2text.extractFromURL(finalURL)
                except Exception:
                    content_web = ""
                content_descr = artl['descr']
                tokens = nltk.word_tokenize(content_descr)
                try:
                    tokens_1 = nltk.word_tokenize(content_web)
                    for word in tokens_1:
                        tokens.append(word)
                except Exception:
                    tokens_1 = []
            else:
                POS_list = ["DIG", "PUNCT", "SYM", "SENT", "CM"]
                # Default to empty lists so the loop below cannot hit
                # undefined names when no BasisEnrichment tokens are present.
                tokens = []
                tokenlist = []
                if artl['BasisEnrichment']['tokens']:
                    tokenlist = artl['BasisEnrichment']['tokens']
                for element in tokenlist:
                    if element['POS'] not in POS_list:
                        tokens.append(element['value'])
            token_filtered = []
            token_normalized = []
            for a in xrange(len(self.timestamp) - 1):
                if self.timestamp[a] <= artl_dt < self.timestamp[a + 1]:
                    timestampindex = self.start_ind + a
                    break
            for word in tokens:
                word_split = re.split('(\W+)', word)
                if len(word_split) == 1:
                    if len(word_split[0]) > 2 and len(word_split[0]) < 15:
                        token_filtered.append(word)
                    else:
                        continue
                elif (len(word_split) == 3 and word_split[2] == '' and len(word_split[0]) > 2 and len(word_split[0]) < 15):
                    token_filtered.append(word_split[0])
            for word in token_filtered:
                try:
                    if not self.contains_digits(word) and word not in self.stop_words:
                        token_normalized.append(utils.normalize_str(word))
                except Exception:
                    continue
            token_unique = list(set(token_normalized))
            for word in token_unique:
                self.final_dict[(word, (articleprovince, articlecountry), article_source, timestampindex)] += token_normalized.count(word)
        return 
Example #15
import webarticle2text
print webarticle2text.extractFromURL("http://sanguoshaenglish.blogspot.com/2010/07/liu-bei.html")
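Most of the examples above wrap extractFromURL in a try/except because it raises on network or parsing failures. A minimal guarded variant of this one-liner (a sketch, not part of the original example):

import webarticle2text

URL = "http://sanguoshaenglish.blogspot.com/2010/07/liu-bei.html"
try:
    text = webarticle2text.extractFromURL(URL)
except Exception as e:  # network errors and unparseable pages surface here
    text = ""
    print("extraction failed: %r" % e)
print(text)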
Example #16
import webarticle2text
import sys

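# Extract the text of every URL listed in HOCTranscriptURL.txt and append it
# to a per-transcript file under ./Transcript/.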
with open('HOCTranscriptURL.txt') as f:
    urllist = f.read().splitlines()
for url in urllist:
    filename = url.split("=")[-1]
    filename = "./Transcript/HOC_" + filename + ".txt"
    article = webarticle2text.extractFromURL(url)
    with open(filename, 'a') as the_file:
        the_file.write(article)
    print filename + " written!"

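# Resume scraping review links from reviews2.csv into review2_text.csv;
# baseUrl is defined in the import block above, and progress is assumed to be
# set earlier in the original script (rows with id <= progress are skipped).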
file_exists = os.path.isfile(
    "/home/ashwin/WorkBench/Dreams/ML/Scrapy/tutorial/review2_text.csv")
o = DictWriter(
    open("/home/ashwin/WorkBench/Dreams/ML/Scrapy/tutorial/review2_text.csv",
         'a', 0), ["id", "Original_Title", "domain", "critic", "link", "text"])
if not file_exists:
    o.writeheader()
with open('/home/ashwin/WorkBench/Dreams/ML/Scrapy/tutorial/reviews2.csv',
          'r') as f:
    reader = DictReader(f)
    for item in reader:
        if int(item['id']) <= progress:
            continue
        if item['link']:
            try:
                text = webarticle2text.extractFromURL(baseUrl + item['link'])
                if text:
                    o.writerow({
                        'Original_Title': item['Original_Title'],
                        'id': item['id'],
                        'domain': item['domain'],
                        'link': item['link'],
                        'text': text,
                        'critic': item['critic'],
                    })
            except:
                print(
                    json.dumps(item,
                               sort_keys=True,
                               indent=2,
                               separators=(',', ': ')))
Example #18
import webarticle2text
import sys
# print webarticle2text.extractFromURL("http://sanguoshaenglish.blogspot.com/2010/07/liu-bei.html")
article = webarticle2text.extractFromURL(sys.argv[1])
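As written, the extracted text is bound to article but never used; a minimal follow-up (an assumption about the script's intent, not part of the original) is to print it or keep a copy on disk:

print(article)
# or persist it:
with open('article.txt', 'w') as f:
    f.write(article)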