コード例 #1
0
def extract_tweet_URLs(text):
    """Collect the URLs found in the ``text`` field of raw JSON tweets.

    Args:
        text: Iterable of raw tweet records; each item is cleaned with
            ``textclean.killgremlins`` and then decoded as one JSON document.

    Returns:
        A ``(urls, errors)`` tuple. ``urls`` is a list holding every value
        returned by ``findURLs`` for a tweet's ``"text"`` field, with double
        quotes removed. ``errors`` is the number of records that raised
        while being decoded or scanned.

    Usage: urls, errors = extract_tweet_URLs(raw_lines)
    """
    print("Extracting URLs")

    errors = 0
    urls = []
    for raw in text:
        raw = textclean.killgremlins(raw)

        try:
            tweet = json.loads(raw)
            if "text" in tweet:
                for found in findURLs(tweet["text"]):
                    urls.append(found.replace('"', ""))
        except Exception:
            # Best-effort parsing: a bad record is counted and skipped.
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            errors += 1

    print("Number of URL Errors: " + str(errors))
    return urls, errors
コード例 #2
0
def extract_features(text):
    """Decode raw JSON tweets and return those that carry a ``text`` field.

    Args:
        text: Iterable of raw tweet records; each item is cleaned with
            ``textclean.killgremlins`` and then decoded as one JSON document.

    Returns:
        A ``(tweets, errors)`` tuple. ``tweets`` is the list of decoded tweet
        objects containing a ``"text"`` key. ``errors`` is the number of
        records that raised while being processed.

    Usage: tweets, errors = extract_features(raw_lines)
    """
    print("Extracting Tweets")

    errors = 0
    tweets = []
    # NOTE(review): the three collections below are filled in but never
    # returned. They are kept so that a failure while building them is still
    # counted in ``errors`` exactly as before; return or drop them once the
    # intended contract is confirmed.
    languages = {}
    texts = []
    urls = []

    for raw in text:
        raw = textclean.killgremlins(raw)

        try:
            tweet = json.loads(raw)
            if "text" in tweet:
                tweets.append(tweet)
                texts.append(tweet["text"])
                for found in findURLs(tweet["text"]):
                    urls.append(found.replace('"', ""))
            if "lang" in tweet:
                lang = tweet["lang"]
                languages[lang] = languages.get(lang, 0) + 1
        except Exception:
            # Best-effort parsing: a bad record is counted and skipped.
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            errors += 1

    print("Number of Tweet Errors: " + str(errors))
    return tweets, errors
コード例 #3
0
def getWords(files):
    """Tally per-document word counts and relative frequencies.

    Each file in ``files`` is read, a fixed set of punctuation characters is
    deleted or turned into spaces, and the text is passed through
    ``tc.killgremlins``, lower-cased and split on single spaces. Results are
    written into the module-level ``words`` and ``docs`` mappings:

    * ``words[word]["docs"]`` -- list of document names containing ``word``.
    * ``docs[name][word]["ct"]`` -- occurrences of ``word`` in the document.
    * ``docs[name][word]["freq"]`` -- ``ct`` divided by the document total,
      rounded to 20 places.
    * ``docs[name]["total"]`` -- number of non-blank tokens, including the
      purely numeric tokens that are otherwise not recorded.

    ``name`` is the file path with ``os.getcwd() + '/bible/'`` removed.
    This function never assigns ``docs[name]`` itself, so that entry must
    already exist (or ``docs`` must create it on access).

    Args:
        files: Iterable of file paths to process.

    Returns:
        None; ``words`` and ``docs`` are updated in place.
    """
    # One translation pass stands in for the chain of single-character
    # replace() calls: newline, carriage return, hyphen and double quote
    # become spaces; the characters in the third argument are deleted.
    cleanup = str.maketrans('\n\r-"', "    ", "':!?.,;()_[]*&")

    for f in files:
        with open(f, "r") as oFile:
            content = oFile.read().translate(cleanup)

        content = tc.killgremlins(content)
        tokens = content.lower().split(" ")

        a = f.replace(os.getcwd() + '/bible/', "")
        doc_counts = docs[a]

        total = 0
        for token in tokens:
            if token.strip() == "":
                continue
            total += 1
            # Purely numeric tokens count toward the total but get no entry.
            if re.fullmatch(r"\d+", token):
                continue

            if token not in words:
                words[token] = {"docs": []}
            if a not in words[token]["docs"]:
                words[token]["docs"].append(a)

            if token not in doc_counts:
                doc_counts[token] = {"ct": 1}
            else:
                doc_counts[token]["ct"] = doc_counts[token]["ct"] + 1

        # NOTE(review): a document containing the literal word "total" has
        # that word's entry overwritten here, because both share one key.
        doc_counts["total"] = total
        for key, entry in doc_counts.items():
            if key != "total":
                entry["freq"] = round(int(entry["ct"]) / total, 20)
コード例 #4
0
def extract_tweets(text):
    """Return the ``text`` field of every decodable raw JSON tweet.

    Args:
        text: Object exposing ``readlines()`` (for example an open file);
            each line is cleaned with ``textclean.killgremlins`` and then
            decoded as one JSON document.

    Returns:
        A ``(tweets, errors)`` tuple. ``tweets`` is the list of ``"text"``
        values from the decoded records that have one. ``errors`` is the
        number of lines that raised while being processed.

    Usage: tweets, errors = tweet_extractor.extract_tweets(raw_text)
    """
    print("Extracting Tweets")

    errors = 0
    tweets = []
    for line in text.readlines():
        line = textclean.killgremlins(line)

        try:
            tweet = json.loads(line)
            if "text" in tweet:
                tweets.append(tweet["text"])
        except Exception:
            # Best-effort parsing: a bad line is counted and skipped.
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            errors += 1

    print("Number of Tweet Errors: " + str(errors))
    return tweets, errors
コード例 #5
0
    raw_tweets = text
    if isinstance(raw_tweets, list):
        input('list check wait wait')
        for i in raw_tweets:

            try:
                extract_data(i)
            except Exception, e:
                print str(e)
                input('first order error')
                errors = errors + 1
                continue
    else:
        for t in raw_tweets:

            t = textclean.killgremlins(t)
            #for dubugging
            #print(t)
            #input("wait")
            try:
                tw = json.loads(t)

                #for dubugging
                #pprint.pprint(tw)
                #input("wait")
                extract_data(tw)

            except:
                errors = errors + 1
                continue