import json
import os
import re

import textclean


def extract_tweet_URLs(text):
    '''Returns a list of URL strings extracted from a raw tweet block and an
    integer counting the errors encountered while parsing.

    usage: urls, errors = tweet_extractor.extract_tweet_URLs(raw_text)
    '''
    print("Extracting URLs")
    errors = 0
    raw_tweets = text
    tweets = []
    for t in raw_tweets:
        t = textclean.killgremlins(t)
        try:
            tw = json.loads(t)
            if "text" in tw:
                for u in findURLs(tw["text"]):
                    tweets.append(u.replace('"', ""))
        except ValueError:  # malformed JSON line
            # print(sys.exc_info()[0])
            errors += 1
            continue
    print("Number of URL Errors: " + str(errors))
    return tweets, errors
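# findURLs is called above but not defined in this module. A minimal sketch,
# assuming it simply regex-matches http(s) URLs in the tweet text; the real
# helper may live elsewhere in the project and behave differently.
def findURLs(text):
    '''Returns every http/https URL found in text (possibly an empty list).'''
    return re.findall(r'https?://\S+', text)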
def extract_features(text):
    '''Parses a raw tweet block: returns the list of parsed tweet dicts and
    an integer counting the errors encountered while parsing. Tweet texts,
    URLs, and a per-language tally are collected along the way.

    usage: tweets, errors = tweet_extractor.extract_features(raw_text)
    '''
    print("Extracting Tweets")
    languages = {}
    errors = 0
    raw_tweets = text
    texts = []   # renamed from `text` to avoid shadowing the parameter
    tweets = []
    turls = []
    for t in raw_tweets:
        t = textclean.killgremlins(t)
        try:
            tw = json.loads(t)
            if "text" in tw:
                tweets.append(tw)
                texts.append(tw["text"])
                for u in findURLs(tw["text"]):
                    turls.append(u.replace('"', ""))
            if "lang" in tw:
                languages[tw["lang"]] = languages.get(tw["lang"], 0) + 1
        except ValueError:  # malformed JSON line
            errors += 1
            continue
    print("Number of Tweet Errors: " + str(errors))
    # NOTE: texts, turls, and languages are built but never returned; callers
    # receive only the parsed tweet dicts and the error count.
    return tweets, errors
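# A minimal usage sketch for extract_features; raw_tweets.json is a
# hypothetical input file holding one JSON-encoded tweet per line.
def _demo_extract_features(path="raw_tweets.json"):
    with open(path, "r") as fh:
        parsed, n_errors = extract_features(fh.readlines())
    print("%d tweets parsed, %d errors" % (len(parsed), n_errors))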
# Module-level tables populated by getWords; assumed to be defined once at
# import time in the original source.
words = {}
docs = {}


def getWords(files):
    '''Tallies word counts and per-document frequencies for each file,
    updating the module-level `words` and `docs` dictionaries in place.'''
    # Characters mapped to spaces vs. deleted outright, matching the
    # original chain of str.replace calls.
    trans = str.maketrans('\n\r-"', '    ', "':!?.,;()_[]*&")
    for f in files:
        with open(f, "r") as oFile:
            longString = ""
            for segment in oFile.readlines():
                longString += segment.translate(trans)
        longString = textclean.killgremlins(longString)
        tokens = longString.lower().split(" ")
        a = f.replace(os.getcwd() + '/bible/', "")
        docs.setdefault(a, {})  # presumably initialized elsewhere originally
        total = 0
        for b in tokens:
            if b.strip() == "":
                continue
            total += 1
            if re.sub(r"\d+", "", b) == "":
                continue  # purely numeric tokens count toward the total only
            if b not in words:
                words[b] = {"docs": []}
            if a not in words[b]["docs"]:
                words[b]["docs"].append(a)
            if b not in docs[a]:
                docs[a][b] = {"ct": 1}
            else:
                docs[a][b]["ct"] += 1
        docs[a]['total'] = total
        for key in docs[a]:
            if key != "total":
                num = int(docs[a][key]["ct"])
                denom = int(docs[a]['total'])
                docs[a][key]["freq"] = round(num / denom, 20)
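# A minimal usage sketch for getWords, assuming a ./bible/ directory of
# plain-text files (the path prefix the function strips from doc names).
def _demo_getWords():
    import glob
    getWords(glob.glob(os.path.join(os.getcwd(), "bible", "*")))
    # Report the highest-frequency token in each document.
    for name, table in docs.items():
        top = max((k for k in table if k != "total"),
                  key=lambda k: table[k]["freq"])
        print(name, top, table[top]["freq"])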
def extract_tweets(text):
    '''Returns a list of tweet-text strings from a raw tweet block and an
    integer counting the errors encountered while parsing. Note that the
    argument is read with .readlines(), so an open file handle is expected.

    usage: tweets, errors = tweet_extractor.extract_tweets(raw_text)
    '''
    print("Extracting Tweets")
    errors = 0
    raw_tweets = text
    tweets = []
    for t in raw_tweets.readlines():
        t = textclean.killgremlins(t)
        try:
            tw = json.loads(t)
            if "text" in tw:
                tweets.append(tw["text"])
        except ValueError:  # malformed JSON line
            errors += 1
            continue
    print("Number of Tweet Errors: " + str(errors))
    return tweets, errors
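# A minimal usage sketch for extract_tweets; note it is handed the open file
# object itself rather than a list of lines. raw_tweets.json is hypothetical.
def _demo_extract_tweets(path="raw_tweets.json"):
    with open(path, "r") as fh:
        texts, n_errors = extract_tweets(fh)
    print("%d tweet texts, %d errors" % (len(texts), n_errors))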
# dispatch_tweets is a stand-in name: this fragment's enclosing routine is
# not shown in the original source.
def dispatch_tweets(text):
    '''Routes input to extract_data: a list is assumed to hold already-parsed
    tweet dicts; anything else is treated as an iterable of raw JSON lines.
    Returns the number of errors encountered.'''
    errors = 0
    raw_tweets = text
    if isinstance(raw_tweets, list):
        input('list check wait wait')  # debug pause
        for i in raw_tweets:
            try:
                extract_data(i)
            except Exception as e:
                print(str(e))
                input('first order error')  # debug pause
                errors += 1
                continue
    else:
        for t in raw_tweets:
            t = textclean.killgremlins(t)
            # for debugging:
            # print(t)
            # input("wait")
            try:
                tw = json.loads(t)
                # for debugging:
                # pprint.pprint(tw)
                # input("wait")
                extract_data(tw)
            except ValueError:  # malformed JSON line
                errors += 1
                continue
    return errors
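# extract_data is called above but not defined in this fragment. A
# hypothetical stub showing the interface the dispatch loop assumes: one
# parsed tweet dict per call. The real implementation lives elsewhere.
def extract_data(tw):
    if "text" in tw:
        print(tw["text"])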