def __geo_to_dict(self, filename, stopwords):
    counter = 0
    id_to_geotok = dict()
    tok = Tokenizer(preserve_case=False)
    geo_functions = geo.GeoFunctions()
    with codecs.open(filename, 'r', "utf-8") as json_file:
        for line in json_file:
            try:
                json_data = json.loads(line, 'utf-8')
                tweet_id = json_data['id']
                tweet = json_data['text']
                coordinates = json_data['geo']['coordinates']
                region = geo_functions.get_region((float(coordinates[0]), float(coordinates[1])))
                # Remove stopwords
                if region != -1:
                    tokenized_tweet = tok.tokenize(tweet)
                    id_to_geotok[tweet_id] = (
                        [token for token in tokenized_tweet if token not in stopwords],
                        region)
                    counter += 1
                    # if counter % 1000 == 0:
                    #     sys.stdout.write('- ')
            except:
                # skip tweets that cannot be parsed or lack geo information
                pass
    return id_to_geotok
def __classify_tweet(self, tweet_text):
    tweet_vector = array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
    tok = Tokenizer(preserve_case=False)
    for token in tok.tokenize(tweet_text):
        if token in self.__wv:
            tweet_vector += self.__wv[token]
    if self.__cosine_sim(tweet_vector, self.__average_distribution) > self.__sim_threshold:
        return None
    tweet_vector_normalized = self.__normalize_len(tweet_vector)
    tweet_vector_diff = tweet_vector_normalized - self.__average_distribution
    return tweet_vector_diff
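# The helpers __cosine_sim and __normalize_len are not shown in this snippet. A minimal
# sketch of what they presumably compute, written as standalone numpy functions (the
# names and exact behaviour here are assumptions, not the original implementation):
import numpy as np

def _cosine_sim(a, b):
    # Cosine similarity between two vectors; returns 0.0 if either vector has zero length.
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0

def _normalize_len(v):
    # Scale a vector to unit length (the zero vector is returned unchanged).
    norm = np.linalg.norm(v)
    return v / norm if norm else v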
def __init__(self, dataFile, usersData):
    self.__data = []
    self.__vocab = dd(int)
    self.__vocabDocCount = dd(int)
    self.__backGround = {}
    self.__commWiseIndices = {}
    self.__commWiseTimeSplitIndices = {}
    self.__communutyWiseVocab = dd(lambda: dd(int))
    self.__users = set()
    self.__userWiseIndices = {}
    self.__userWiseTimeSplitIndices = {}
    self.__timeWiseUserSplitIndices = dd(lambda: dd(int))
    self._tok = Tokenizer(preserve_case=False)
    self.__userJoins = dd(lambda: -1)
    self.timeHandler = TimeHandler()
    self.sampledUsers = set()
    self.activeForums = {}
    self.activeUsersInForums = dd(set)
    ## Processing/dealing with data
    #self.__read(dataFile)
    self.__justRead(dataFile)
    self.__loadUsersJoins(usersData)
    self.__splitUserWise()
    self.__userWiseTimeSplit()
    #self.__timeWiseUserSplit()
    #self.__commWiseTimeSplit()
    ## Extra data structures
    self.postingFreq = dd(int)
def __init__(self, dataFile):
    self.__data = []
    self.__commWiseIndices = {}
    self.__commWiseSampleIndices = {}
    self.__commWiseSampleWordFreq = dd(lambda: dd(int))
    self.__read(dataFile)
    self._tok = Tokenizer(preserve_case=False)
def __jsons_to_dict(self, tweet_file, stopwords):
    counter = 0
    id_to_tok = dict()
    tok = Tokenizer(preserve_case=False)
    with codecs.open(tweet_file, 'r', "utf-8") as json_file:
        for line in json_file:
            try:
                json_data = json.loads(line, 'utf-8')
                tweet = json_data['text']
                tweet_id = json_data['id']
                tokenized_tweet = tok.tokenize(tweet)
                # Remove stopwords
                id_to_tok[tweet_id] = [token for token in tokenized_tweet if token not in stopwords]
                counter += 1
                # if counter % 1000 == 0:
                #     sys.stdout.write('+ ')
            except:
                # skip lines that cannot be parsed
                pass
    return id_to_tok
def read_and_count():
    dictionary = {}
    tweetfolder = '/home/gontrum/april-corpus-raw'
    tok = Tokenizer(preserve_case=False)
    for tweetfile in [folder for folder in os.listdir(tweetfolder) if folder.startswith('tweets')]:
        tweetfile = os.path.join(tweetfolder, tweetfile)
        with open(tweetfile, 'r') as f:
            for line in f:
                try:
                    tw = json.loads(line, 'latin1')['text']
                except:
                    continue  # skip lines that cannot be parsed
                for each in tok.tokenize(tw):
                    dictionary[each] = dictionary.get(each, 0) + 1
    return dictionary
def __init__(self, dataFile):
    self.__data = []
    self.__vocab = dd(int)
    self.__vocabDocCount = dd(int)
    self.__backGround = {}
    self.__commWiseIndices = {}
    self.__communutyWiseVocab = dd(lambda: dd(int))
    self._tok = Tokenizer(preserve_case=False)
    self.__read(dataFile)
def GetWordDictionary(filePAth):
    csv.field_size_limit(sys.maxsize)
    ifile = open(filePAth, "rb")
    reader = csv.reader(ifile)
    word_dictionary = {}
    tok = Tokenizer(preserve_case=False)
    for row in reader:
        tokens = []
        try:
            tokens = tok.tokenize(row[3])
        except Exception as e:
            print e
        for token in tokens:
            word_dictionary[token] = word_dictionary.get(token, 0) + 1
    return word_dictionary
def __init__(self):
    self.posts = []
    self.userwiseThreads = dd(set)
    self.userwisePosts = dd(set)  # Stores indices
    self.threads = dd(list)
    self.userNames = {}
    self.fakeRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(are |r |re |ar |is |be )(a )(fake|faking|faker|netbanger|net banger|fakeass|net-banger|fake-ass)\\b")
    self.noRealRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(aren't |ain't |arent |aint |isn't |isnt |are not |is not |not )(no )?real\\b")
    self.tok = Tokenizer()
    self.badChars = set(['$', ')', '(', '+', '*', '-', '.', '<', '?', '>', '[', ']', '^', '|'])
    self.fakeUsers = {}  # Stores the postId of the previous fake annotation we did
def main():
    args = parseArgs()
    if args.log_level == 'debug':
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    if args.log_destination == 'file':
        handler = logging.FileHandler('importSnapshotToMongoDB.log')
    else:
        handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter("%(asctime)s; %(levelname)s; %(message)s"))
    logger.addHandler(handler)

    if args.data_dir[-1] != '/':
        args.data_dir += '/'

    uri = "mongodb://%s:%d/%s" % (args.mongoServerHost, args.mongoServerPort, args.database)
    logger.info("Connecting to %s" % uri)
    client = pymongo.MongoClient(uri)[args.database]
    logger.info("Connected to %s" % uri)

    files = glob.glob(args.data_dir + '*.data')
    for file in files:
        logger.info("reading %s" % file)
        tweets = [date_hook(ujson.loads(l)) for l in open(file)]
        logger.info("%d tweets read from %s" % (len(tweets), file))
        if len(tweets) > 0:
            if not args.skip_tokenization:
                logger.info("Tokenizing tweets")
                tokenizer = Tokenizer(preserve_case=True)
                tokenized_tweets = [tokenizer.tokenize(tweet['twitter']['text']) for tweet in tweets]
                logger.info("Tagging tweets")
                tagger = TreeTagger(path_to_bin=args.path_to_treetagger, path_to_param=args.path_to_treetagger_param_file)
                tagged_tweets = tagger.tag(tokenized_tweets)
                for i in range(len(tweets)):
                    tweets[i]['tagged_tweet'] = tagged_tweets[i]
            logger.info("Loading tweets into database")
            client['tweets'].insert(tweets)
    logger.info("done.")
def __init__(self, dataFile, usersData):
    sys.stderr.write('In Constructor\n')
    self.__data = []
    self.__userJoins = dd(lambda: -1)
    self.__vocab = dd(int)
    self.__vocabDocCount = dd(int)
    self.__backGround = {}
    self.__commWiseIndices = {}
    self.__commWiseTimeSeparatedIndices = dd(lambda: dd(list))
    self.__communutyWiseVocab = dd(lambda: dd(int))
    self._tok = Tokenizer(preserve_case=False)
    self.__users = set()
    self.__read(dataFile)
    self.__loadUsersJoins(usersData)
    self.__months = {'January': 1, 'February': 2, 'March': 3, 'April': 4,
                     'May': 5, 'June': 6, 'July': 7, 'August': 8,
                     'September': 9, 'October': 10, 'November': 11, 'December': 12}
def NaiveBesianClassifer(positive_word_frequency, negative_words_frequency, count_pos_words,
                         count_neg_words, tweet, class_pos_prob, class_neg_prob):
    tok = Tokenizer(preserve_case=False)
    tokens = tok.tokenize(tweet)
    positiveClassProb = 1.00
    negativeClassProb = 1.00
    for token in tokens:
        positiveClassProb = positiveClassProb * LaplaceSmoothingValue(token, positive_word_frequency, count_pos_words)
        negativeClassProb = negativeClassProb * LaplaceSmoothingValue(token, negative_words_frequency, count_neg_words)
    positiveClassProb = positiveClassProb * class_pos_prob
    negativeClassProb = negativeClassProb * class_neg_prob
    if positiveClassProb >= negativeClassProb:
        print positiveClassProb, 1
        return positiveClassProb, 1
    else:
        print negativeClassProb, -1
        return negativeClassProb, -1
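# LaplaceSmoothingValue is referenced above but not defined in this snippet. A minimal
# sketch of the add-one (Laplace) smoothed class-conditional probability it presumably
# returns; the vocabulary-size term and default value are assumptions:
def LaplaceSmoothingValue(token, word_frequency, total_word_count, vocab_size=100000):
    # P(token | class) with add-one smoothing over an assumed vocabulary size.
    return (word_frequency.get(token, 0) + 1.0) / (total_word_count + vocab_size)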
class DataSampler():
    def __init__(self, dataFile):
        self.__data = []
        self.__commWiseIndices = {}
        self.__commWiseSampleIndices = {}
        self.__read(dataFile)
        self._tok = Tokenizer(preserve_case=False)

    def __read(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()
        csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        index = 0
        tempDD = dd(list)
        for record in csvReader:
            record = filter(lambda x: x.strip(), record)
            try:
                self.__data.append(tuple(record))
                tempDD[record[3]].append(index)
                index += 1
            except:
                pass
                ##print record
                ##sys.exit()
        for key, value in tempDD.iteritems():
            if key.find("Talk") >= 0:
                self.__commWiseIndices[key] = value
        sys.stderr.write("Read " + str(index) + " records\n")

    def _tokenize(self, text):
        text = text.strip()
        text = re.sub('[\s\n]+', ' ', text)
        return self._tok.tokenize(text)

    def communityWiseSample(self):
        numPosts = 1000
        for key in self.__commWiseIndices.iterkeys():
            self.__commWiseSampleIndices[key] = random.sample(self.__commWiseIndices[key], numPosts)

    def prepareOutput(self, outputFile):
        outputFile = open(outputFile, 'w')
        csvWriter = csv.writer(outputFile)
        for key in self.__commWiseSampleIndices.iterkeys():
            for index in self.__commWiseSampleIndices[key]:
                #tokens = self._tokenize(self.__data[index][1])
                csvWriter.writerow(self.__data[index])
        outputFile.close()
def __init__(self, dataFile, usersData):
    self.__data = []
    self.__vocab = dd(int)
    self.__vocabDocCount = dd(int)
    self.__backGround = {}
    self.__commWiseIndices = {}
    self.__commWiseTimeSeparatedIndices = dd(lambda: dd(list))
    self.__communutyWiseVocab = dd(lambda: dd(int))
    self.__users = set()
    self.__userWiseIndices = {}
    self._tok = Tokenizer(preserve_case=False)
    self.__userJoins = dd(lambda: -1)
    self.__read(dataFile)
    self.__loadUsersJoins(usersData)
    self.__splitUserWise()
    self.__timeHandler = TimeHandler()
class Classifier(nltk.ClassifierI):
    normalizer = N1()
    tokenizer = Tokenizer()

    def __init__(self, modelfile="model.unigram.nb.bool.politics.unbiased"):
        modelpath = os.path.join(os.path.abspath(os.path.dirname(__file__)), "models", modelfile)
        f = open(modelpath, 'rb')
        self.model = pickle.load(f)
        f.close()

    def classify(self, features):
        return self.model.classify(features)

    def prob_classify(self, features):
        return self.model.prob_classify(features)

    def labels(self):
        return self.model.labels()

    def valence(self, features):
        hyp = self.model.classify(features)
        posterior = self.model.prob_classify(features)
        # print "valence=%f"%(posterior.prob("positive")/(1-posterior.prob("positive")) - posterior.prob("negative")/(1-posterior.prob("negative")))
        # print posterior.prob("positive")/(1-posterior.prob("positive"))
        # print posterior.prob("negative")/(1-posterior.prob("negative"))
        if hyp == "negative":
            valence = -posterior.prob("negative")
        elif hyp == "positive":
            valence = posterior.prob("positive")
        else:
            valence = 0
        return valence

    def classifyFromText(self, text):
        def features(text, n=1):
            feats = defaultdict(bool)
            words = ['<s>'] + self.normalizer.normalize(self.tokenizer.tokenize(text)) + ['</s>']
            for i in range(len(words)):
                for j in range(i + 1, i + n + 1):
                    feat = " ".join(words[i:j])
                    feats[feat] = True
            return feats

        features = features(text)
        label = self.model.classify(features)
        valence = self.valence(features)
        post = self.prob_classify(features)
        return label, valence, post
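# A hedged usage sketch for the classifier above, assuming the pickled NLTK model is
# available at the default path ("models/model.unigram.nb.bool.politics.unbiased");
# the example text is invented:
clf = Classifier()
label, valence, posterior = clf.classifyFromText("I really enjoyed this debate :)")
print label, valence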
def __init__(self):
    self.__conn = M.connect('localhost', 'phani', 'phani', 'hoodup')
    self.tok = Tokenizer()
for u in chat_utterances:
    if u.get('class') == a:
        print "Example of {}: {}".format(a, u.text)
        break

# This kind of language is pretty different from the edited writing that many NLP tools assume.
# For machine learning it hardly matters what the input to the classifier is, but it does pay to
# be smarter about dividing the text up into its tokens (the words or other meaningful elements).
# So we'll load in [the tokenizer that Chris Potts wrote][1] to analyze twitter feeds. Some of the
# things that it does nicely:
#
# - Handles emoticons, hashtags, twitter user names and other items that mix letters and punctuation
# - Merges dates, URLs, phone numbers and similar items into single tokens
# - Handles ordinary punctuation in an intelligent way as well
#
# [1]:http://sentiment.christopherpotts.net/tokenizing.html

# In[11]:

from happyfuntokenizing import Tokenizer
chat_tokenize = Tokenizer(preserve_case=False).tokenize

# Now we set up the features for this data set. The code is closely analogous to what we did with
# the sentiment classifier earlier. The big differences are the tokenization and the fact that we
# skip stopword elimination: content-free words and weird punctuation bits like `what` and `:)` are
# going to be very important for understanding what dialogue act somebody is performing, so we need
# to keep those features around!

# In[12]:

def chat_feature_generator(category):
    return (word for post in chat_utterances
            if post.get('class') == category
            for word in chat_tokenize(post.text))

best_act_words = compute_best_features(dialogue_acts, chat_feature_generator, 2000)
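# As a quick sanity check on the tokenizer loaded above (the example text is invented, and the
# expected tokens are only indicative of the behaviour described earlier, not a verified output):
print chat_tokenize("@dan_b can't wait for 10/28 :-) #nlproc http://example.com")
# roughly: ['@dan_b', "can't", 'wait', 'for', '10/28', ':-)', '#nlproc', 'http://example.com']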
def __init__(self, tweetsFile):
    self._tweets = []
    self._tok = Tokenizer(preserve_case=False)
    self.loadTweets(tweetsFile)
    sys.stderr.write("preprocessor instance created\n")
    sys.stderr.write("@ Mentions removed\n")
class preprocessor:
    def __init__(self, tweetsFile):
        self._tweets = []
        self._tok = Tokenizer(preserve_case=False)
        self.loadTweets(tweetsFile)
        sys.stderr.write("preprocessor instance created\n")
        sys.stderr.write("@ Mentions removed\n")

    def anonnimize(self, tweet):
        tweet = tweet.split('\t')[-1]  ## Assumption about the format
        tweet = self._tok.tokenize(tweet)  ## Tokenization
        anonTweet = []
        for word in tweet:
            if word[0] != '@':
                anonTweet.append(word)
        return anonTweet

    def loadTweets(self, tweetsFile):
        for tweet in open(tweetsFile):
            tweet = tweet.strip()
            tokenizedTweet = self.anonnimize(tweet)
            if len(tokenizedTweet) == 0 or ' '.join(tokenizedTweet).strip() == '':
                continue
            self._tweets.append([tweet.split('\t')[0], tweet.split('\t')[1], tokenizedTweet])

    def removeRetweets(self):
        newTweets = []
        for tweet in self._tweets:
            flag = 0
            for word in tweet[2]:
                if word[:2] == 'rt':
                    flag = 1
                    break
            if flag == 0:
                newTweets.append(tweet)
        self._tweets = [t for t in newTweets]
        sys.stderr.write("Retweets removed\n")

    def filterAuthors(self):
        authorDict = dd(int)
        for tweet in self._tweets:
            authorDict[tweet[0]] += 1
        filteredAuthors = []
        for auth, tweets in authorDict.iteritems():
            if tweets >= 50:
                filteredAuthors.append(auth)
        filteredAuthors = set(filteredAuthors)
        filteredTweets = []
        for tweet in self._tweets:
            if tweet[0] in filteredAuthors:
                filteredTweets.append(tweet)
        self._tweets = [t for t in filteredTweets]

    def authorStats(self):
        authorDict = dd(int)
        for tweet in self._tweets:
            authorDict[tweet[0]] += 1
        numDict = dd(int)
        for auth, numTweets in authorDict.iteritems():
            numDict[numTweets - (numTweets % 10)] += 1
        self.drawGraph(numDict)

    def drawGraph(self, authorDict):
        #try:
        authors = [x for x in authorDict.iterkeys()]
        authors = sorted(authors, cmp=lambda x, y: x - y)
        numTweets = [authorDict[x] for x in authors]
        width = 0.2
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # bar chart of the data
        rects = ax.bar(np.arange(len(authors)), numTweets, width, color='r')
        ax.set_xlabel('User')
        ax.set_ylabel('Number of tweets')
        ax.set_xticks(np.arange(len(authors)) + width / 2)
        ax.set_xticklabels(map(lambda x: str(x), authors))

        def autolabel(rects):
            # attach some text labels
            for rect in rects:
                height = rect.get_height()
                ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height, '%d' % int(height),
                        ha='center', va='bottom')

        autolabel(rects)
        plt.savefig(open("/usr0/home/pgadde/Work/Ethnic/AAEness/Data/RealTweets/PreProcessing/aaeAuthorTweets.png", "w"))
        plt.show()

    def printInFile(self, output, label):
        output = open(output, 'w')
        for tweet in self._tweets:
            tweet[2].insert(0, label)
            try:
                output.write(tweet[0] + "\t" + tweet[1] + "\t" + "\t".join(tweet[2]) + "\n")
            except UnicodeEncodeError:
                pass
        output.close()
class DataProcessor:
    def __init__(self, dataFile):
        self.__data = []
        self.__vocab = dd(int)
        self.__vocabDocCount = dd(int)
        self.__backGround = {}
        self.__commWiseIndices = {}
        self.__communutyWiseVocab = dd(lambda: dd(int))
        self._tok = Tokenizer(preserve_case=False)
        self.__read(dataFile)

    def _tokenize(self, text):
        text = text.strip()
        text = re.sub('[\s\n]+', ' ', text)
        return self._tok.tokenize(text)

    def freqVector(self, tokens):
        tempFreqVector = dd(int)
        for token in tokens:
            tempFreqVector[token] += 1
        return tempFreqVector

    def __updateVocab(self, record):
        comm = record[3]
        if comm.find('Talk') < 0:
            return
        text = record[1]
        tokenDict = self.freqVector(self._tokenize(text))
        for word, freq in tokenDict.iteritems():
            self.__vocab[word] += freq
            self.__communutyWiseVocab[comm][word] += freq
            self.__vocabDocCount[word] += 1
        ##print self.__vocab

    def __read(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()
        csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        index = 0
        tempDD = dd(list)
        for record in csvReader:
            try:
                self.__data.append(tuple(record))
                self.__updateVocab(record)
                tempDD[record[3]].append(index)
                index += 1
            except:
                pass
        for key, value in tempDD.iteritems():
            if key.find("Talk") >= 0:
                self.__commWiseIndices[key] = value
        sys.stderr.write("Read " + str(index) + " records\n")
        sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n")

    def preprocessVocab(self):
        self.__backGround = {}
        totalVocab = self.__vocab.keys()
        for word in totalVocab:
            freq = self.__vocab[word]
            if freq >= 5 and self.__vocabDocCount[word] >= 5:
                self.__backGround[word] = freq
            else:
                del self.__vocab[word]
        for comm in self.__communutyWiseVocab.iterkeys():
            commVocab = self.__communutyWiseVocab[comm].keys()
            for word in commVocab:
                if word in self.__vocab:
                    continue
                del self.__communutyWiseVocab[comm][word]
        sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n")
def init(preserve_case=True):
    global tokenizer
    tokenizer = Tokenizer(preserve_case=preserve_case)
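# A hedged usage sketch for this module-level initializer; the calling context and the
# example text are assumptions:
init(preserve_case=False)
tokens = tokenizer.tokenize("RT @user: loving the new #NLP tools :D")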
class FakeMatcher: def __init__(self): self.posts = [] self.userwiseThreads = dd(lambda:dd(lambda:-1)) self.userwisePosts = dd(set) # Stores indices self.userLastPost = dd(lambda:-1) self.threads = dd(list) self.userStart = dd(lambda:5000) self.userNames = {} self.fakeRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(are |r |re |ar |is |be )(a )(fake|faking|faker|netbanger|net banger|fakeass|net-banger|fake-ass)\\b") self.noRealRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(aren't |ain't |arent |aint |isn't |isnt |are not |is not |not )(no )?real\\b") self.tok = Tokenizer() self.badChars = set(['$', ')', '(', '+', '*', '-', '.', '<', '?', '>', '[', ']', '^', '|']) def loadData(self, dataFile): dataFile = open(dataFile) dataFile.readline() reader = csv.reader(dataFile, quotechar='"', escapechar="\\") postIndex = 0 for line in reader: self.posts.append(line) thread = line[3] user = line[1] username = line[0] self.userNames[user] = ' '.join(self.tok.tokenize(username)) self.threads[thread].append(postIndex) if self.userwiseThreads[user][thread] < 0 or self.userwiseThreads[user][thread] > postIndex: self.userwiseThreads[user][thread] = postIndex self.userwisePosts[user].add(postIndex) days = int(line[8]) if self.userLastPost[user] < days: self.userLastPost[user] = days if self.userStart[user] > int(line[8]): self.userStart[user] = int(line[8]) postIndex += 1 self.sortThreads() def sortThreads(self): for thread in self.threads.iterkeys(): self.threads[thread] = sorted(self.threads[thread], cmp=lambda x, y:x - y) def filterUsers(self): allUsers = self.userwisePosts.keys() for user in allUsers: if len(self.userwisePosts[user]) < 20 or len(self.userwisePosts[user]) > 150 or (self.userStart[user] - self.userLastPost[user]) > 120: del self.userwisePosts[user] del self.userwiseThreads[user] del self.userNames[user] def hasFake(self, postId): postText = self.posts[postId][4] #if postText.find(" you a fake ")>=0: # print postText return (self.fakeRE.search(postText) != None) or (self.noRealRE.search(postText) != None) def printFakePosts(self, logFile): logFile = open(logFile, 'w') index = 0 for post in self.posts: if self.hasFake(index): logFile.write('\t'.join(post[:5]) + '\n') index += 1 def printFakeUsers(self, fakersFile): fakersFile = open(fakersFile, 'w', 1) for user in self.userwiseThreads.iterkeys(): fakePostCount = 0 fakePostIds = set() for thread in self.userwiseThreads[user].iterkeys(): userFirstPost = self.userwiseThreads[user][thread] postIndex = self.threads[thread].index(userFirstPost) + 1 while postIndex < len(self.threads[thread]): postId = self.threads[thread][postIndex] if self.hasFake(postId): #print 'here' fakePostCount += 1 fakePostIds.add(postId) postIndex += 1 if fakePostCount > 5: fakersFile.write(user + '\t' + ' '.join(map(lambda x:str(x), list(fakePostIds))) + '\n') fakersFile.close() def makeRECompatible(self, userName): for char in self.badChars: if char != '\\': userName = userName.replace(char, "\\" + char) return userName def bigRESearch(self, logFile): logFile = open(logFile, 'w', 1) bigUserName = "******" for userName in self.userNames.itervalues(): if userName in ["dat n***a", "bitch"]: continue if userName.strip() != "": if self.considerUserName(userName): userName = self.makeRECompatible(userName) bigUserName += userName + " |" bigUserName = bigUserName[:-1] + ")" bigUserName += "(is )(a )?(fake|faking|faker|netbanger|net banger|fakeass|net-banger|fake-ass)" print len(bigUserName) print bigUserName P = 
re.compile(bigUserName) #sampleText = "i wanna see wat dat n***a about but i aint gonna fite him im on parole . but dat n***a fake so i dont even matter" #while 1: # sampleText = raw_input("Enter the text: ") # if sampleText == 'exit': # break # print "Full match:",P.search(sampleText).group(), " username match:",P.search(sampleText).group(1) for post in self.posts: text = post[4] if P.search(text) != None: logFile.write('\t'.join(post[:5]) + '\n') logFile.close() def printNonChars(self): nonChars = set() for userName in self.userNames.itervalues(): userName = userName.lower() for char in userName: if ord(char) >= 32 and ord(char) <= 126 and (ord(char) < 97 or ord(char) > 122) and ord(char) not in range(48, 58): nonChars.add(char) print "Users:", len(self.userNames) print nonChars def contentToLookAt(self): uniqThreads = set() uniqPosts = set() for userId in self.userNames.iterkeys(): for thread in self.userwiseThreads[userId]: uniqThreads.add(thread) for post in self.threads[thread]: uniqPosts.add(post) print "Users to look at:", len(self.userNames) print "Unique threads to look at:", len(uniqThreads) print "Unique posts to look at:", len(uniqPosts) def isAllLetters(self, userName): for char in userName: if ord(char) < 97 or ord(char) > 122: return False return True def considerUserName(self, userName): for char in userName: o = ord(char) if o < 32 or o > 126: return False return True def matchUserNamesInPosts(self, logFile): logFile = open(logFile, 'w', 1) for userId in self.userNames.iterkeys(): userName = self.userNames[userId] if not self.isAllLetters(userName): continue for post in self.posts: if post[4].find(userName) >= 0: logFile.write(str(userId) + '\t' + userName + '\t' + post[4] + '\n') logFile.close()
def main():
    args = parseArgs()
    if args.log_level == 'debug':
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    if args.log_destination == 'file':
        handler = logging.FileHandler('importSnapshotToMongoDB.log')
    else:
        handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter("%(asctime)s; %(levelname)s; %(message)s"))
    logger.addHandler(handler)

    if args.snapshot_dir[-1] != '/':
        args.snapshot_dir += '/'
    if args.database:
        database = args.database
    else:
        database = "snapshot_" + args.snapshot_dir.split('/')[-2]

    uri = "mongodb://%s:%d/%s" % (args.mongoServerHost, args.mongoServerPort, database)
    logger.info("Connecting to %s" % uri)
    client = pymongo.MongoClient(uri)[database]
    logger.info("Connected to %s" % uri)

    files = glob.glob(args.snapshot_dir + '*.data')
    for file in files:
        logger.info("reading %s" % file)
        tweets = [date_hook(ujson.loads(l)) for l in open(file)]
        logger.info("%d tweets read from %s" % (len(tweets), file))
        if len(tweets) > 0:
            if not args.skip_tokenization:
                logger.info("Tokenizing tweets")
                tokenizer = Tokenizer(preserve_case=True)
                tokenized_tweets = [tokenizer.tokenize(tweet['tweet']) for tweet in tweets]
                logger.info("Tagging tweets")
                tagger = TreeTagger(path_to_bin=args.path_to_treetagger, path_to_param=args.path_to_treetagger_param_file)
                tagged_tweets = tagger.tag(tokenized_tweets)
                for i in range(len(tweets)):
                    tweets[i]['tagged_tweet'] = tagged_tweets[i]
            logger.info("Loading tweets into database")
            client['tweets'].insert(tweets)

    logger.info("Loading users from %susers.db" % args.snapshot_dir)
    connection = sqlite3.connect("%susers.db" % args.snapshot_dir)
    connection.row_factory = sqlite3.Row
    cursor = connection.cursor()
    logger.info('fetching users')
    cursor.execute('SELECT id,friends FROM users where friends is not NULL')
    users = cursor.fetchall()
    logger.info('%d users fetched' % len(users))

    bulk_size = 25000
    nUsersInserted = 0
    usersToBeInserted = []
    for user in users:
        id = user['id']
        friends = ujson.loads(user['friends'])
        usersToBeInserted.append({'id': id, 'friends': friends})
        if len(usersToBeInserted) >= bulk_size:
            client['users'].insert(usersToBeInserted)
            usersToBeInserted = []
            nUsersInserted += bulk_size
            logger.info("%d users inserted" % nUsersInserted)
    client['users'].insert(usersToBeInserted)
    logger.info("all users inserted.")
    logger.info("done.")
class EmpiricalAnalyzer: def __init__(self, dataFile): self.__data = [] self.__vocab = dd(int) self.__vocabDocCount = dd(int) self.__backGround = {} self.__commWiseIndices = {} self.__communutyWiseVocab = dd(lambda: dd(int)) self._tok = Tokenizer(preserve_case=False) self.__read(dataFile) def _tokenize(self, text): text = text.strip() text = re.sub("[\s\n]+", " ", text) return self._tok.tokenize(text) def freqVector(self, tokens): tempFreqVector = dd(int) for token in tokens: tempFreqVector[token] += 1 return tempFreqVector def __updateVocab(self, record): comm = record[3] if comm.find("Talk") < 0: return text = record[1] if text.find("http") >= 0 or text.find("<blockquote>") >= 0: return 0 tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): self.__vocab[word] += freq self.__communutyWiseVocab[comm][word] += freq self.__vocabDocCount[word] += 1 return 1 ##print self.__vocab def __read(self, dataFile): dataFile = open(dataFile) dataFile.readline() csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\") index = 0 tempDD = dd(list) for record in csvReader: try: self.__data.append(tuple(record)) succ = self.__updateVocab(record) if succ: tempDD[record[3]].append(index) index += 1 except: pass for key, value in tempDD.iteritems(): if key.find("Talk") >= 0: self.__commWiseIndices[key] = value sys.stderr.write("Read " + str(index) + " records\n") sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n") def preprocessVocab(self): stopWords = [w.strip() for w in open("stopWords")] self.__backGround = {} totalVocab = self.__vocab.keys() for word in totalVocab: freq = self.__vocab[word] if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords: self.__backGround[word] = freq else: del self.__vocab[word] totalWords = 0 for word, freq in self.__backGround.iteritems(): totalWords += freq for word, freq in self.__backGround.iteritems(): self.__backGround[word] = self.__backGround[word] / float(totalWords) for comm in self.__communutyWiseVocab.iterkeys(): commVocab = self.__communutyWiseVocab[comm].keys() totalWords = 0 for word in commVocab: if word in self.__vocab: totalWords += self.__communutyWiseVocab[comm][word] continue del self.__communutyWiseVocab[comm][word] for word in self.__communutyWiseVocab[comm].iterkeys(): self.__communutyWiseVocab[comm][word] = self.__communutyWiseVocab[comm][word] / float(totalWords) sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n") def printTop1000InBack(self, outFile): outFile = open(outFile, "w") wordFreqs = [(word, freq) for word, freq in self.__backGround.iteritems()] wordFreqs = sorted(wordFreqs, cmp=lambda x, y: y[1] - x[1])[:1000] for wordFreq in wordFreqs: outFile.write(wordFreq[0] + "\n") outFile.close() def printTop1000(self, D, outFile): outFile = open(outFile, "w") wordFreqs = [(word, freq) for word, freq in D.iteritems()] wordFreqs = sorted(wordFreqs, cmp=myCMP) for wordFreq in wordFreqs: outFile.write(wordFreq[0] + "\t" + str(wordFreq[1]) + "\n") outFile.close() def __logOdd(self, word, commFreq): try: return math.log(commFreq * 1.0 / self.__backGround[word]) except ZeroDivisionError: return 100000 except ValueError: return -100000 except: print word, commFreq, self.__backGround[word] sys.exit("Error while calculating logodds") def prepareCommunityWiseVocab(self): for word in self.__backGround.iterkeys(): for comm in self.__communutyWiseVocab.iterkeys(): self.__communutyWiseVocab[comm][word] = self.__logOdd(word, self.__communutyWiseVocab[comm][word]) def 
printTopDeviations(self, baseDir): backFile = baseDir + "/" + "background" self.printTop1000(self.__backGround, backFile) for comm in self.__communutyWiseVocab.iterkeys(): self.printTop1000(self.__communutyWiseVocab[comm], baseDir + "/" + comm.strip().replace(" ", ""))
class DataHandler: def __init__(self, dataFile, usersData): self.__data = [] self.__vocab = dd(int) self.__vocabDocCount = dd(int) self.__backGround = {} self.__commWiseIndices = {} self.__commWiseTimeSeparatedIndices = dd(lambda: dd(list)) self.__communutyWiseVocab = dd(lambda: dd(int)) self.__users = set() self.__userWiseIndices = {} self._tok = Tokenizer(preserve_case=False) self.__userJoins = dd(lambda: -1) self.__read(dataFile) self.__loadUsersJoins(usersData) self.__splitUserWise() self.__timeHandler = TimeHandler() def __loadUsersJoins(self, usersData): dataFile = open(usersData) for line in dataFile: line = line.strip().split("\t") self.__userJoins[line[0]] = line[1] ## Correct the indices sys.stderr.write("Loaded " + str(len(self.__userJoins)) + " users' joins\n") def __validUserId(self, userId): try: userId = int(userId) assert userId >= 1 and userId <= 45037 return True except: return False def __splitUserWise(self): tempDD = dd(list) for index in range(len(self.__data)): user = self.__data[index][5] if not self.__validUserId(user): continue tempDD[user].append(index) for user in tempDD.iterkeys(): self.__userWiseIndices[user] = copy.deepcopy(tempDD[user]) del tempDD def __read(self, dataFile): dataFile = open(dataFile) dataFile.readline() csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\") index = 0 tempDD = dd(list) for record in csvReader: try: succ = self.__updateVocab(record) if succ: self.__data.append(tuple(record)) tempDD[record[3]].append(index) tempDD["AllTalk"].append(index) self.__users.add(record[5]) index += 1 except: pass for key, value in tempDD.iteritems(): if key.find("Talk") >= 0: self.__commWiseIndices[key] = value sys.stderr.write("Read " + str(index) + " records\n") sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n") sys.stderr.write("Users: " + str(len(self.__users)) + "\n") def _tokenize(self, text): text = text.strip() text = re.sub("[\s\n]+", " ", text) return self._tok.tokenize(text) def freqVector(self, tokens): tempFreqVector = dd(int) for token in tokens: tempFreqVector[token] += 1 return tempFreqVector def __updateVocab(self, record): if len(record) != 7: return comm = record[3] if comm.find("Talk") < 0: return 0 text = record[1] if text.find("http") >= 0 or text.find("<blockquote>") >= 0: return 0 tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): self.__vocab[word] += freq self.__communutyWiseVocab[comm][word] += freq self.__vocabDocCount[word] += 1 return 1 ##print self.__vocab def preprocessVocab(self, stopWords): self.__backGround = {} totalVocab = self.__vocab.keys() for word in totalVocab: freq = self.__vocab[word] if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords: self.__backGround[word] = freq else: del self.__vocab[word] for comm in self.__communutyWiseVocab.iterkeys(): commVocab = self.__communutyWiseVocab[comm].keys() for word in commVocab: if word in self.__vocab: continue del self.__communutyWiseVocab[comm][word] sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n") def getAllUsers(self): return copy.deepcopy(self.__users) def getUserDataIndices(self, user): userDataIndices = [] for index in range(len(self.__data)): userDataIndices.append(index) return copy.deepcopy(userDataIndices) def divideBasedOnMonths(self, data): timeDividedIndices = dd(list) for index in data: timeDiff = self.__timeDiff(index) if timeDiff >= 0: timeDividedIndices[timeDiff].append(index) return copy.deepcopy(timeDividedIndices) def __timeDiff(self, 
recordIndex): try: record = self.__data[recordIndex] postTime = str(record[4]) user = str(record[5]) userJoin = self.__userJoins[user] return self.__timeHandler.diffMonths(postTime, userJoin) except: return -1 def makeDist(self, data): totalWords = 0 dist = dd(lambda: 1) for text in data: ## I just expect an array of texts, not the entire records tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): if word in self.__vocab: dist[word] += freq totalWords += freq for word in self.__vocab: dist[word] += 0 totalWords += len(self.__vocab) for word in self.__vocab: dist[word] /= float(totalWords) ##dist[word] = round(-1*self.myLog(dist[word]),2) ## Log transformation!! return dist
def __init__(self, dataFile):
    self.__data = []
    self.__commWiseIndices = {}
    self.__commWiseSampleIndices = {}
    self.__read(dataFile)
    self._tok = Tokenizer(preserve_case=False)
class FakeMatcher:
    def __init__(self):
        self.posts = []
        self.userwiseThreads = dd(set)
        self.userwisePosts = dd(set)  # Stores indices
        self.threads = dd(list)
        self.userNames = {}
        self.fakeRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(are |r |re |ar |is |be )(a )(fake|faking|faker|netbanger|net banger|fakeass|net-banger|fake-ass)\\b")
        self.noRealRE = re.compile("\\b(you |u |u're |you're |u'r |you'r |your |ur |username )(aren't |ain't |arent |aint |isn't |isnt |are not |is not |not )(no )?real\\b")
        self.tok = Tokenizer()
        self.badChars = set(['$', ')', '(', '+', '*', '-', '.', '<', '?', '>', '[', ']', '^', '|'])
        self.fakeUsers = {}  # Stores the postId of the previous fake annotation we did

    def loadData(self, dataFile):
        dataFile = open(dataFile)
        dataFile.readline()
        reader = csv.reader(dataFile, quotechar='"', escapechar="\\")
        postIndex = 0
        for line in reader:
            self.posts.append(line)
            thread = line[3]
            user = line[1]
            username = line[0]
            self.userNames[user] = ' '.join(self.tok.tokenize(username))
            self.threads[thread].append(postIndex)
            self.userwiseThreads[user].add(thread)
            self.userwisePosts[user].add(postIndex)
            postIndex += 1

    def loadFakeUsers(self, fakeAnnotation):
        fakeAnnotation = csv.reader(open(fakeAnnotation))
        for line in fakeAnnotation:
            try:
                dummy = int(line[1])
                dummy = int(line[2])
            except:
                continue
            self.fakeUsers[line[1]] = int(line[2])

    def filterUsers(self):
        allUsers = self.userwisePosts.keys()
        for user in allUsers:
            if user not in self.fakeUsers.iterkeys():
                del self.userwisePosts[user]
                del self.userwiseThreads[user]
                del self.userNames[user]

    def hasFake(self, postId):
        postText = self.posts[postId][4]
        return (self.fakeRE.search(postText) != None) or (self.noRealRE.search(postText) != None)

    def printFakeUsers(self, fakersDir):
        for user in self.fakeUsers:
            fakePostIds = []
            for thread in self.userwiseThreads[user]:
                for postIndex in self.threads[thread]:
                    if self.hasFake(postIndex):
                        fakePostIds.append(postIndex)
            fakePostIds = sorted(fakePostIds, cmp=lambda x, y: int(self.posts[x][2]) - int(self.posts[y][2]))
            #print user, self.posts[fakePostIds[0]][2], self.fakeUsers[user]
            if len(fakePostIds) > 0 and self.posts[fakePostIds[0]][2] != str(self.fakeUsers[user]):
                #self.printPosts(user, fakePostIds)
                dummy = 1
            else:
                print user

    def printPosts(self, user, fakePostIds):
        # NOTE: fakersDir is not defined in this method's scope; it would need to be passed in
        # (see the commented-out call in printFakeUsers above).
        fakersFile = open(fakersDir + user, 'w', 1)
        for postIndex in fakePostIds:
            postId = self.posts[postIndex][2]
            postBody = self.posts[postIndex][4]
            fakersFile.write(postId + '\t' + postBody + '\n')
        fakersFile.close()

    def sanityCheck(self):
        print "Posts:", len(self.posts)
        print "Users:", len(self.userwiseThreads)
        print "Fake users:", len(self.fakeUsers)
        for user in self.fakeUsers:
            if user not in self.userwiseThreads.iterkeys():
                print user
class DataHandler: def __init__(self, dataFile, usersData): self.__data = [] self.__vocab = dd(int) self.__vocabDocCount = dd(int) self.__backGround = {} self.__commWiseIndices = {} self.__commWiseTimeSplitIndices = {} self.__communutyWiseVocab = dd(lambda:dd(int)) self.__users = set() self.__userWiseIndices = {} self.__userWiseTimeSplitIndices = {} self.__timeWiseUserSplitIndices = dd(lambda:dd(int)) self._tok = Tokenizer(preserve_case=False) self.__userJoins = dd(lambda:-1) self.timeHandler = TimeHandler() self.sampledUsers = set() self.activeForums = {} self.activeUsersInForums = dd(set) ## Processing/dealing with data #self.__read(dataFile) self.__justRead(dataFile) self.__loadUsersJoins(usersData) self.__splitUserWise() self.__userWiseTimeSplit() #self.__timeWiseUserSplit() #self.__commWiseTimeSplit() ## Extra data structures self.postingFreq = dd(int) def printMonthlyDataForUser(self, user, outFile): userTimeIndices = self.__userWiseTimeSplitIndices[user] for month in userTimeIndices.iterkeys(): f = csv.writer(open(outFile+"."+str(month),"w")) for index in userTimeIndices[month]: f.writerow(self.__data[index]) def tokenizeRecord(self, record): record = list(copy.deepcopy(record)) #print record try: text = record[1] tokenizedText = ' '.join(self._tokenize(text)) record[1] = tokenizedText #print tokenizedText return record except: return -1 def getTokenizedCSV(self): tokenizedRecords = [] for index in range(len(self.__data)): newRecord = self.tokenizeRecord(self.__data[index]) if newRecord != -1: tokenizedRecords.append(newRecord) return tokenizedRecords def getBasicUserMonthRecord(self, user, month): record = [] record.append(user) record.append(month) record.append(self.activeForums[user]) record.append([]) return record def getTokenizedUserMonthCSV(self): tokenizedRecords = dd(lambda:dd(list)) for user in self.__userWiseTimeSplitIndices.iterkeys(): for month in self.__userWiseTimeSplitIndices[user].iterkeys(): for index in self.__userWiseTimeSplitIndices[user][month]: newRecord = self.tokenizeRecord(self.__data[index]) if newRecord != -1: tokenizedRecords[user][month].append(newRecord[1]) ## Only postBody being given! return tokenizedRecords def getTokenizedUserMonthForumCSV(self): tokenizedRecords = dd(lambda:dd(lambda:dd(list))) for user in self.__userWiseTimeSplitIndices.iterkeys(): for month in self.__userWiseTimeSplitIndices[user].iterkeys(): for index in self.__userWiseTimeSplitIndices[user][month]: newRecord = self.tokenizeRecord(self.__data[index]) if newRecord != -1: forum = newRecord[3] tokenizedRecords[user][month][forum].append(newRecord[1]) ## Only postBody being given! 
return tokenizedRecords def getPost2Month(self): post2Month = {} for user in self.__userWiseTimeSplitIndices.iterkeys(): for month in self.__userWiseTimeSplitIndices[user].iterkeys(): for index in self.__userWiseTimeSplitIndices[user][month]: postId = self.__data[index][0] post2Month[postId] = month return copy.deepcopy(post2Month) def getDoc2Post(self): doc2Post = {} for index in range(len(self.__data)): doc2Post[index+1] = self.__data[index][0] return copy.deepcopy(doc2Post) def getPost2User(self): post2User = {} for user in self.__userWiseIndices.iterkeys(): for index in self.__userWiseIndices[user]: postId = self.__data[index][0] post2User[postId] = user return copy.deepcopy(post2User) def getPostingFreq(self): self.postingFreq = dd(int) for user in self.__userWiseIndices.iterkeys(): self.postingFreq[len(self.__userWiseIndices[user])-len(self.__userWiseIndices[user])%10] += 1 return copy.deepcopy(self.postingFreq) def getCumulativePostingFreq(self): sys.stderr.write("Total Users:"+str(len(self.__userWiseIndices))+"\n") self.postingFreq = dd(int) for user in self.__userWiseIndices.iterkeys(): userPosts = len(self.__userWiseIndices[user])-len(self.__userWiseIndices[user])%10 for num in range(0,userPosts+1,10): self.postingFreq[num] += 1 return copy.deepcopy(self.postingFreq) def getCutoffPostingFreq(self): totalPosts = 0 cdfFreqPosting = dd(int) for user in self.__userWiseIndices.iterkeys(): userPosts = len(self.__userWiseIndices[user])-len(self.__userWiseIndices[user])%10 totalPosts += userPosts for num in range(0,userPosts+1,10): cdfFreqPosting[num] += userPosts for num in cdfFreqPosting.iterkeys(): cdfFreqPosting[num] = round(cdfFreqPosting[num]*100.0/float(totalPosts),2) sys.stderr.write("Total Users:"+str(len(self.__userWiseIndices))+"\n") sys.stderr.write("Total Posts:"+str(totalPosts)+"\n") return copy.deepcopy(cdfFreqPosting) def getMonthwisePostingFrequency(self): timeWisePostedUsers = dd(int) for time in self.__timeWiseUserSplitIndices.iterkeys(): timeWisePostedUsers[time] = len(self.__timeWiseUserSplitIndices[time]) return copy.deepcopy(timeWisePostedUsers) def getMonthwiseBinnedPostingFrequency(self): timeWisePostedUsers = dd(int) for time in self.__timeWiseUserSplitIndices.iterkeys(): userWiseIndices = self.__timeWiseUserSplitIndices[time] postingFreq = dd(int) for user in userWiseIndices.iterkeys(): userPosts = len(self.__userWiseIndices[user]) for num in range(0,userPosts+1): postingFreq[num] += 1 timeWisePostedUsers[time] = copy.deepcopy(postingFreq) return copy.deepcopy(timeWisePostedUsers) def getBasicTable(self): table = [] for user in self.__userWiseTimeSplitIndices.iterkeys(): userSubtable = [] for month in self.__userWiseTimeSplitIndices[user].iterkeys(): try: activeForum = self.activeForums[user] if activeForum == 'NULL': continue if int(month) >100: continue content = (user, month, len(self.__userWiseTimeSplitIndices[user][month]), self.activeForums[user]) userSubtable.append(content) except: pass if len(userSubtable) >= 3: table.extend(userSubtable) return table def totalPostsByUsers(self): total = 0 for user in self.__userWiseIndices.iterkeys(): total += len(self.__userWiseIndices[user]) return total def getTopPosterCoverage(self): totalPosts = self.totalPostsByUsers() postsTillTopN = 0 def __loadUsersJoins(self, usersData): dataFile = open(usersData) for line in dataFile: line = line.strip().split('\t') self.__userJoins[line[0]] = line[1] ## Correct the indices sys.stderr.write("Loaded " + str(len(self.__userJoins)) + " users' joins\n") def 
loadActiveForums(self, activeForums): for line in csv.reader(open(activeForums)): try: self.activeForums[line[0]] = line[1] self.activeUsersInForums[line[1]].add(line[0]) except: pass def __validUserId(self, userId): try: userId = int(userId) assert userId >= 1 and userId <= 45037 return True except: return False def __splitUserWise(self): tempDD = dd(list) for index in range(len(self.__data)): try: user = self.__data[index][5] except: continue if not self.__validUserId(user): continue tempDD[user].append(index) for user in tempDD.iterkeys(): self.__userWiseIndices[user] = copy.deepcopy(tempDD[user]) del tempDD def __userWiseTimeSplit(self): for user in self.__userWiseIndices.iterkeys(): self.__userWiseTimeSplitIndices[user] = self.divideBasedOnMonths(self.__userWiseIndices[user]) def __timeWiseUserSplit(self): for user in self.__userWiseIndices.iterkeys(): timeDividedUserData = self.divideBasedOnMonths(self.__userWiseIndices[user]) for time in timeDividedUserData.iterkeys(): self.__timeWiseUserSplitIndices[time][user] = timeDividedUserData[time] return copy.deepcopy(self.__timeWiseUserSplitIndices) def __commWiseTimeSplit(self): for comm in self.__commWiseIndices.iterkeys(): self.__commWiseTimeSplitIndices[comm] = self.divideBasedOnMonths(self.__commWiseIndices[comm]) def __justRead(self, dataFile): dataFile = open(dataFile) dataFile.readline() csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\") for record in csvReader: #self.__data.append(tuple(record[1:])) self.__data.append(tuple(record)) def __read(self, dataFile): dataFile = open(dataFile) dataFile.readline() csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\") index = 0 tempDD = dd(list) for record in csvReader: try: succ = self.__updateVocab(record) if succ: self.__data.append(tuple(record)) tempDD[record[3]].append(index) tempDD['AllTalk'].append(index) self.__users.add(record[5]) index += 1 except: pass for key, value in tempDD.iteritems(): if key.find("Talk") >= 0: self.__commWiseIndices[key] = value sys.stderr.write("Read " + str(index) + " records\n") sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n") sys.stderr.write("Users: " + str(len(self.__users)) + "\n") def _tokenize(self, text): text = text.strip() text = re.sub('[\s\n]+', ' ', text) return self._tok.tokenize(text) def freqVector(self, tokens): tempFreqVector = dd(int) for token in tokens: tempFreqVector[token] += 1 return tempFreqVector def __updateVocab(self, record): if len(record)!=7: return comm = record[3] if comm.find('Talk') < 0: return 0 text = record[1] if text.find("http") >= 0 or text.find("<blockquote>") >= 0: return 0 tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): self.__vocab[word] += freq self.__communutyWiseVocab[comm][word] += freq self.__vocabDocCount[word] += 1 return 1 ##print self.__vocab def preprocessVocab(self, stopWords): self.__backGround = {} totalVocab = self.__vocab.keys() for word in totalVocab: freq = self.__vocab[word] if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords: self.__backGround[word] = freq else: del self.__vocab[word] for comm in self.__communutyWiseVocab.iterkeys(): commVocab = self.__communutyWiseVocab[comm].keys() for word in commVocab: if word in self.__vocab: continue del self.__communutyWiseVocab[comm][word] sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n") def getAllUsers(self): return copy.deepcopy(self.__users) def userStats(self, outFile): outFile = open(outFile,'w') for user in 
self.__userWiseIndices.iterkeys(): userDataIndices = self.__userWiseIndices[user] timeDividedUserIndices = self.divideBasedOnMonths(userDataIndices) outFile.write('\t'.join(map(lambda x:str(x), [user, len(timeDividedUserIndices)]))+'\n') outFile.close() def getUserDataIndices(self, user): userDataIndices = [] for index in range(len(self.__data)): userDataIndices.append(index) return copy.deepcopy(userDataIndices) def divideBasedOnMonths(self, data): timeDividedIndices = dd(list) for index in data: timeDiff = -1 try: timeDiff = self.__timeDiff(index) except: continue if timeDiff >= 0: timeDividedIndices[timeDiff].append(index) #else: # print timeDiff return copy.deepcopy(timeDividedIndices) def __timeDiff(self, recordIndex): #try: #print recordIndex record = self.__data[recordIndex] postTime = str(record[4]) user = str(record[5]) userJoin = self.__userJoins[user] return self.timeHandler.diffMonths(postTime, userJoin) #except: # return -1 def makeDist(self, data): totalWords = 0 dist = dd(lambda:1) for text in data: ## I just expect an array of texts, not the entire records tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): if word in self.__vocab: dist[word] += freq totalWords += freq for word in self.__vocab: dist[word] += 0 totalWords += len(self.__vocab) for word in self.__vocab: dist[word] /= float(totalWords) ##dist[word] = round(-1*self.myLog(dist[word]),2) ## Log transformation!! #assert self.isValid(dist) return dist def isValid(self, dist): sumProb = 0 for x in dist.iterkeys(): sumProb += dist[x] print sumProb return True def sampleUsers(self): US = userSampling(self.__userWiseTimeSplitIndices) self.sampledUsers = US.finalizeUsers() self.__userWiseTimeSplitIndices = copy.deepcopy(US.userWiseTimeSplitIndices) return copy.deepcopy(self.sampledUsers) def getUserMonths(self, user): months = copy.deepcopy(self.__userWiseTimeSplitIndices[user].keys()) for i in range(1,4): try: months.remove(i) except: pass for i in range(25,31): try: months.remove(i) except: pass return months def getUserDataForDivergence(self, user, month): return [copy.deepcopy(self.__data[index][1]) for index in self.__userWiseTimeSplitIndices[user][month]] def getUserInitialData(self, user): data = [] for month in range(1,4): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data def getUserMaturedData(self, user): data = [] for month in range(25,31): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data def getActiveForum(self, userNum): return self.activeForums[userNum] def getForumInitialData(self, comm): #assert comm in self.__commWiseIndices data = [] #for user in self.__users: for user in self.activeUsersInForums[comm]: for month in range(1,4): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data def getForumMaturedData(self, comm): #assert comm in self.__commWiseIndices data = [] #for user in self.__users: for user in self.activeUsersInForums[comm]: for month in range(25,31): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data
class ThreadCreator: def __init__(self): self.__conn = M.connect('localhost', 'phani', 'phani', 'hoodup') self.tok = Tokenizer() def __getMaxPage(self, page): soup = BS(open(page).read()) try: pagesTag = int(soup.findAll('a', onclick="jumpto(); return false;")[0].findChildren('strong')[-1].contents[0]) return pagesTag except: return -1 def __getPostId(self, postProfile): postId = -1 try: postId = int(postProfile.find('dl', {'class':'postprofile'})['id'].split('profile')[1]) except: postId = -1 pass return postId def getSmileyText(self, smileyTag): title = smileyTag["title"] title = title.lower() title.replace(" ", "_") title = re.sub("[^a-z_]", "", title) return "___" + title + "___" def constructString(self, contentTag): content = "" for cont in contentTag.contents: if type(cont) == bs4.element.NavigableString: content += " " + cont #print cont elif type(cont) == bs4.element.Tag and cont.name == 'span': content += " " + self.constructString(cont) elif type(cont) == bs4.element.Tag and cont.name == 'img' and cont["src"].find("./images/smilies/") == 0: content += " " + self.getSmileyText(cont) return content def __getPostBody(self, postBodyTag): postBody = "" try: contentDiv = postBodyTag.find('div', {'class':'content'}) postBody = self.constructString(contentDiv) ##for cont in contentDiv: ## #print cont, type(cont) ## if type(cont) == bs4.element.NavigableString: ## postBody += cont except: pass if postBody == "": return "NULL" return postBody.decode('utf8') def __getUser(self, postProfile): user = -1 try: user = int(postProfile.find('a')['href'].split('u=')[1].split('&')[0]) except: user = -1 pass return user def __getForum(self, soup): forum = "NULL" try: forum = soup.find('li', {'class':'nav-forum active'}).find('a').find('span').contents[0] except: pass return forum def __getPostTime(self, postBody): postTime = "NULL" try: postTime = ' '.join(postBody.find('p').contents[-1].strip().split(" ")[1:]) except: pass return postTime def __getPostBodyTag(self, postProfile): postBody = postProfile.nextSibling flag = 0 while getattr(postBody, 'name', None) != 'div': if getattr(postBody, 'name', None) == 'span': flag = 1 break postBody = postBody.nextSibling if flag: None return postBody def getPosts(self, page, tId): #print 'In getPosts' soup = BS(open(page).read()) ##print soup postProfiles = soup.findAll('div', {'class':"profile"}) #print "NUM:",len(postProfiles) posts = [] threadId = tId forum = self.__getForum(soup) for postProfile in postProfiles: #print 'Inside post profiles' postId = self.__getPostId(postProfile) user = self.__getUser(postProfile) postBodyTag = self.__getPostBodyTag(postProfile) ##print postBodyTag if postBodyTag != None: time = self.__getPostTime(postBodyTag) postBody = self.__getPostBody(postBodyTag) postBody = re.sub("\[youtube\].*?\[/youtube\]", "", postBody) if postBody.find("quote") > postBody.find("/quote"): postBody = postBody[postBody.find("/quote") + 6:] #inReply = -1 postBody = ' '.join(self.tok.tokenize(postBody)) postBody = postBody.replace("\\", "") ##print postId, postBody, threadId, forum, time, user,inReply ##sys.exit() posts.append((user, postId, threadId, postBody, forum, time)) return posts def getPostsInThread(self, baseDir, fId, tId): posts = [] firstPage = 'http://thehoodup.com/board/viewtopic.php?f=' + fId + '&t=' + tId + '&start=0' ##os.system('wget -P '+baseDir+' "'+firstPage+'"') page = baseDir + firstPage.split('/board/')[1] ###print page posts.extend(self.getPosts(page, tId)) ##sys.exit() maxPages = self.__getMaxPage(page) for pageIndex in 
range(1, maxPages): offset = pageIndex * 50 url = 'http://thehoodup.com/board/viewtopic.php?f=' + fId + '&t=' + tId + '&start=' + str(offset) ##os.system('wget -P '+baseDir+' "'+url+'"') ##continue page = baseDir + url.split('/board/')[1] pagePosts = self.getPosts(page, tId) posts.extend(pagePosts) return posts def createThreadsTable(self, baseDir, threads, outFile): outFile = open(outFile, 'w') writer = csv.writer(outFile, quotechar='"', escapechar="\\") ##cursor = self.__conn.cursor() for thread in threads: fId, tId = thread try: posts = self.getPostsInThread(baseDir, fId, tId) except: pass #print posts #sys.exit() ##continue try: for post in posts: writer.writerow(post) ##cursor.execute("""insert into allThreads values(%s,%s,%s,%s,%s,%s,%s)""",post) except: pass ##self.__conn.commit() outFile.close()
import glob
import ujson
from happyfuntokenizing import Tokenizer
from TreeTaggerWrapper import TreeTagger

path_to_data = '../data/snapshots/2014-10-20/'
files = glob.glob(path_to_data + '2014-1*.data')
tokenizer = Tokenizer(preserve_case=True)
tagger = TreeTagger(path_to_bin='/Users/jmague/Documents/work/treetagger/bin/tree-tagger',
                    path_to_param='/Users/jmague/Documents/work/treetagger/lib/french-utf8.par')

for fileName in files:
    print fileName
    file = open(fileName)
    tweets = [ujson.loads(l) for l in file]
    tokenized_tweets = [tokenizer.tokenize(tweet['tweet']) for tweet in tweets]
    tagged_tweets = tagger.tag(tokenized_tweets)
    for i in range(len(tweets)):
        tweets[i]['tagged_tweet'] = tagged_tweets[i]
    output_file_name = fileName[:-5] + '-tagged.data'
    file = open(output_file_name, 'w')
    for tweet in tweets:
        file.write("%s\n" % ujson.dumps(tweet))
def __init__(self, tweet):
    self.tweet = tweet
    self.lookup = LookupService(model='bing-body/apr10/5')
    self.toz = Tokenizer()
    self.tokens = self.tokenize(tweet)
class DataHandler: def __init__(self, dataFile, usersData): self.__data = [] self.__vocab = dd(int) self.__vocabDocCount = dd(int) self.__backGround = {} self.__commWiseIndices = {} self.__commWiseTimeSplitIndices = {} self.__communutyWiseVocab = dd(lambda:dd(int)) self.__users = set() self.__userWiseIndices = {} self.__userWiseTimeSplitIndices = {} self._tok = Tokenizer(preserve_case=False) self.__userJoins = dd(lambda:-1) self.__read(dataFile) self.__loadUsersJoins(usersData) self.__splitUserWise() self.timeHandler = TimeHandler() self.__userWiseTimeSplit() #self.__commWiseTimeSplit() self.sampledUsers = set() self.activeForums = {} self.activeUsersInForums = dd(set) def __loadUsersJoins(self, usersData): dataFile = open(usersData) for line in dataFile: line = line.strip().split('\t') self.__userJoins[line[0]] = line[1] ## Correct the indices sys.stderr.write("Loaded " + str(len(self.__userJoins)) + " users' joins\n") def loadActiveForums(self, activeForums): for line in open(activeForums): line = line.strip().split("\t") try: self.activeForums[line[0]] = line[1] self.activeUsersInForums[line[1]].add(line[0]) except: pass def __validUserId(self, userId): try: userId = int(userId) assert userId >= 1 and userId <= 45037 return True except: return False def __splitUserWise(self): tempDD = dd(list) for index in range(len(self.__data)): user = self.__data[index][5] if not self.__validUserId(user): continue tempDD[user].append(index) for user in tempDD.iterkeys(): self.__userWiseIndices[user] = copy.deepcopy(tempDD[user]) del tempDD def __userWiseTimeSplit(self): for user in self.__userWiseIndices.iterkeys(): self.__userWiseTimeSplitIndices[user] = self.divideBasedOnMonths(self.__userWiseIndices[user]) def __commWiseTimeSplit(self): for comm in self.__commWiseIndices.iterkeys(): self.__commWiseTimeSplitIndices[comm] = self.divideBasedOnMonths(self.__commWiseIndices[comm]) def __read(self, dataFile): dataFile = open(dataFile) dataFile.readline() csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\") index = 0 tempDD = dd(list) for record in csvReader: try: succ = self.__updateVocab(record) if succ: self.__data.append(tuple(record)) tempDD[record[3]].append(index) tempDD['AllTalk'].append(index) self.__users.add(record[5]) index += 1 except: pass for key, value in tempDD.iteritems(): if key.find("Talk") >= 0: self.__commWiseIndices[key] = value sys.stderr.write("Read " + str(index) + " records\n") sys.stderr.write("Word types " + str(len(self.__vocab)) + "\n") sys.stderr.write("Users: " + str(len(self.__users)) + "\n") def _tokenize(self, text): text = text.strip() text = re.sub('[\s\n]+', ' ', text) return self._tok.tokenize(text) def freqVector(self, tokens): tempFreqVector = dd(int) for token in tokens: tempFreqVector[token] += 1 return tempFreqVector def __updateVocab(self, record): if len(record)!=7: return comm = record[3] if comm.find('Talk') < 0: return 0 text = record[1] if text.find("http") >= 0 or text.find("<blockquote>") >= 0: return 0 tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): self.__vocab[word] += freq self.__communutyWiseVocab[comm][word] += freq self.__vocabDocCount[word] += 1 return 1 ##print self.__vocab def preprocessVocab(self, stopWords): self.__backGround = {} totalVocab = self.__vocab.keys() for word in totalVocab: freq = self.__vocab[word] if freq >= 5 and self.__vocabDocCount[word] >= 50 and word not in stopWords: self.__backGround[word] = freq else: del self.__vocab[word] for comm in 
self.__communutyWiseVocab.iterkeys(): commVocab = self.__communutyWiseVocab[comm].keys() for word in commVocab: if word in self.__vocab: continue del self.__communutyWiseVocab[comm][word] sys.stderr.write("Filtered Word types " + str(len(self.__backGround)) + "\n") def getAllUsers(self): return copy.deepcopy(self.__users) def userStats(self, outFile): outFile = open(outFile,'w') for user in self.__userWiseIndices.iterkeys(): userDataIndices = self.__userWiseIndices[user] timeDividedUserIndices = self.divideBasedOnMonths(userDataIndices) outFile.write('\t'.join(map(lambda x:str(x), [user, len(timeDividedUserIndices)]))+'\n') outFile.close() def getUserDataIndices(self, user): userDataIndices = [] for index in range(len(self.__data)): userDataIndices.append(index) return copy.deepcopy(userDataIndices) def divideBasedOnMonths(self, data): timeDividedIndices = dd(list) for index in data: timeDiff = self.__timeDiff(index) if timeDiff >= 0: timeDividedIndices[timeDiff].append(index) #else: # print timeDiff return copy.deepcopy(timeDividedIndices) def __timeDiff(self, recordIndex): #try: #print recordIndex record = self.__data[recordIndex] postTime = str(record[4]) user = str(record[5]) userJoin = self.__userJoins[user] return self.timeHandler.diffMonths(postTime, userJoin) #except: # return -1 def makeDist(self, data): totalWords = 0 dist = dd(lambda:1) for text in data: ## I just expect an array of texts, not the entire records tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): if word in self.__vocab: dist[word] += freq totalWords += freq for word in self.__vocab: dist[word] += 0 totalWords += len(self.__vocab) for word in self.__vocab: dist[word] /= float(totalWords) ##dist[word] = round(-1*self.myLog(dist[word]),2) ## Log transformation!! 
#assert self.isValid(dist) return dist def isValid(self, dist): sumProb = 0 for x in dist.iterkeys(): sumProb += dist[x] print sumProb return True def sampleUsers(self): US = userSampling(self.__userWiseTimeSplitIndices) self.sampledUsers = US.finalizeUsers() self.__userWiseTimeSplitIndices = copy.deepcopy(US.userWiseTimeSplitIndices) return copy.deepcopy(self.sampledUsers) def getUserMonths(self, user): months = copy.deepcopy(self.__userWiseTimeSplitIndices[user].keys()) for i in range(1,4): try: months.remove(i) except: pass for i in range(25,31): try: months.remove(i) except: pass return months def getUserDataForDivergence(self, user, month): return [copy.deepcopy(self.__data[index][1]) for index in self.__userWiseTimeSplitIndices[user][month]] def getUserInitialData(self, user): data = [] for month in range(1,4): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data def getUserMaturedData(self, user): data = [] for month in range(25,31): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data def getActiveForum(self, userNum): return self.activeForums[userNum] def getForumInitialData(self, comm): #assert comm in self.__commWiseIndices data = [] #for user in self.__users: for user in self.activeUsersInForums[comm]: for month in range(1,4): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data def getForumMaturedData(self, comm): #assert comm in self.__commWiseIndices data = [] #for user in self.__users: for user in self.activeUsersInForums[comm]: for month in range(25,31): try: for index in self.__userWiseTimeSplitIndices[user][month]: data.append(self.__data[index][1]) except: pass return data
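## Standalone sketch of the add-one smoothed unigram distribution that DataHandler.makeDist
## builds above: every vocabulary word starts with a pseudo-count of 1, observed in-vocabulary
## frequencies are added on top, and the result is normalised by (token count + |V|),
## i.e. P(w) = (c(w) + 1) / (N + |V|). Function and variable names here are illustrative only.
def smoothed_unigram_dist(token_freqs, vocab):
    dist = {}
    total = 0
    for word in vocab:
        dist[word] = 1          # add-one pseudo-count for every vocabulary word
        total += 1
    for word, freq in token_freqs.items():
        if word in dist:        # out-of-vocabulary tokens are ignored, as in makeDist
            dist[word] += freq
            total += freq
    for word in dist:
        dist[word] /= float(total)   # normalise to a probability distribution
    return dist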
class DataSampler(): def __init__(self, dataFile): self.__data = [] self.__commWiseIndices = {} self.__commWiseSampleIndices = {} self.__commWiseSampleWordFreq = dd(lambda:dd(int)) self.__read(dataFile) self._tok = Tokenizer(preserve_case=False) def __read(self, dataFile): dataFile = open(dataFile) dataFile.readline() csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\") index = 0 tempDD = dd(list) for record in csvReader: #print record try: self.__data.append(tuple(record)) tempDD[record[3]].append(index) index+=1 except: print record sys.exit() for key, value in tempDD.iteritems(): if key.find("Talk")>=0: self.__commWiseIndices[key] = value sys.stderr.write("Read "+str(index)+" records\n") def sanityCheck(self): self.__printDictSizes(self.__commWiseIndices) self.__printDictSizes(self.__commWiseSampleIndices) #print filter(lambda x:x[0]=="74526", self.__data) def __printDictSizes(self, D): for key in D.iterkeys(): sys.stdout.write(key+"\t"+str(len(D[key]))+"\n") def _tokenize(self, text): text = text.strip() text = re.sub('[\s\n]+',' ', text) return self._tok.tokenize(text) def communityWiseSample(self): numPosts = 30000 for key in self.__commWiseIndices.iterkeys(): self.__commWiseSampleIndices[key] = random.sample(self.__commWiseIndices[key], numPosts) sys.stderr.write("Sampled "+str(numPosts*5)+"\n") def freqVector(self, tokens): tempFreqVector = dd(int) for token in tokens: tempFreqVector[token] += 1 return tempFreqVector def __filterWords(self, backGroundVector): lexicon = set() for word, freq in backGroundVector.iteritems(): if freq > 9: lexicon.add(word) return lexicon def preparePosts(self, outputFile): outputFile = open(outputFile,'w') backGroundVector = dd(int) for key in self.__commWiseSampleIndices.iterkeys(): for index in self.__commWiseSampleIndices[key]: tokens = self._tokenize(self.__data[index][1]) freqVector = self.freqVector(tokens) for token, freq in freqVector.iteritems(): backGroundVector[token] += freq print "Background words:",len(backGroundVector) filteredLexicon = self.__filterWords(backGroundVector) print "Filtered Words:",len(filteredLexicon) ##sys.exit() for key in self.__commWiseSampleIndices.iterkeys(): for index in self.__commWiseSampleIndices[key]: tokens = self._tokenize(self.__data[index][1]) freqVector = self.freqVector(tokens) words = [x+"$:$:"+str(y) for x,y in freqVector.iteritems() if x in filteredLexicon] if len(words) > 0: outputFile.write(key+'\t'+' '.join(words)+'\n') outputFile.write('background'+'\t'+' '.join([x+"$:$:"+str(y) for x,y in backGroundVector.iteritems() if x in filteredLexicon])+'\n') outputFile.close() def analyzeLexicon(self, lexicon, background): words = [(w,f) for w,f in background.iteritems() if w in lexicon] words = sorted(words,cmp=lambda x,y:y[1]-x[1]) index = 0 while 1: print words[index] dummy = raw_input() index += 1 def preparePostsSingleDoc(self, outputFile): outputFile = open(outputFile,'w') backGroundVector = dd(int) for key in self.__commWiseSampleIndices.iterkeys(): for index in self.__commWiseSampleIndices[key]: tokens = self._tokenize(self.__data[index][1]) freqVector = self.freqVector(tokens) for token, freq in freqVector.iteritems(): backGroundVector[token] += freq print "Background words:",len(backGroundVector) filteredLexicon = self.__filterWords(backGroundVector) print "Filtered Words:",len(filteredLexicon) ##self.analyzeLexicon(filteredLexicon, backGroundVector) ##sys.exit() for key in self.__commWiseSampleIndices.iterkeys(): globalFreqVector = dd(int) for index in 
self.__commWiseSampleIndices[key]: tokens = self._tokenize(self.__data[index][1]) freqVector = self.freqVector(tokens) for word, freq in freqVector.iteritems(): globalFreqVector[word] += freq words = [x+"$:$:"+str(y) for x,y in globalFreqVector.iteritems() if x in filteredLexicon] if len(words) > 0: outputFile.write(key+'\t'+' '.join(words)+'\n') outputFile.write('background'+'\t'+' '.join([x+"$:$:"+str(y) for x,y in backGroundVector.iteritems() if x in filteredLexicon])+'\n') outputFile.close()
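## Hedged sketch of a reader for the files written by preparePosts / preparePostsSingleDoc
## above: one community per line, a tab after the community name, then space-separated
## tokens encoded as word$:$:count. The function name is illustrative, not part of the code above.
def read_community_counts(path):
    comm_counts = {}
    with open(path) as f:
        for line in f:
            comm, _, rest = line.rstrip('\n').partition('\t')
            counts = {}
            for entry in rest.split(' '):
                word, sep, freq = entry.rpartition('$:$:')
                if sep:                      # skip anything not in word$:$:count form
                    counts[word] = int(freq)
            comm_counts[comm] = counts
    return comm_counts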
class TimeDividedData: def __init__(self, dataFile, usersData): sys.stderr.write('In Constructor\n') self.__data = [] self.__userJoins = dd(lambda:-1) self.__vocab = dd(int) self.__vocabDocCount = dd(int) self.__backGround = {} self.__commWiseIndices = {} self.__commWiseTimeSeparatedIndices = dd(lambda:dd(list)) self.__communutyWiseVocab = dd(lambda:dd(int)) self._tok = Tokenizer(preserve_case=False) self.__users = set() self.__read(dataFile) self.__loadUsersJoins(usersData) self.__months = {'January':1, 'February':2, 'March':3,'April':4, 'May':5, 'June':6, 'July':7, 'August':8, 'September':9, 'October':10, 'November':11, 'December':12} def __loadUsersJoins(self, usersData): dataFile = open(usersData) for line in dataFile: line = line.strip().split('\t') self.__userJoins[line[0]] = line[1] ## Correct the indices sys.stderr.write("Loaded "+str(len(self.__userJoins))+" users' joins\n") def _tokenize(self, text): text = text.strip() text = re.sub('[\s\n]+',' ', text) return self._tok.tokenize(text) def freqVector(self, tokens): tempFreqVector = dd(int) for token in tokens: tempFreqVector[token] += 1 return tempFreqVector def __updateVocab(self, record): comm = record[3] if comm.find('Talk')<0: return 0 text = record[1] if text.find("http")>=0 or text.find("<blockquote>")>=0: return 0 tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): self.__vocab[word] += freq self.__communutyWiseVocab[comm][word] += freq self.__vocabDocCount[word] += 1 return 1 ##print self.__vocab def __read(self, dataFile): dataFile = open(dataFile) dataFile.readline() csvReader = csv.reader(dataFile, quotechar='"', escapechar="\\") index = 0 tempDD = dd(list) for record in csvReader: try: self.__data.append(tuple(record)) succ = self.__updateVocab(record) if succ: tempDD[record[3]].append(index) tempDD['AllTalk'].append(index) self.__users.add(record[5]) index+=1 except: pass for key, value in tempDD.iteritems(): if key.find("Talk")>=0: self.__commWiseIndices[key] = value sys.stderr.write("Read "+str(index)+" records\n") sys.stderr.write("Word types "+str(len(self.__vocab))+"\n") sys.stderr.write("Users: "+str(len(self.__users))+"\n") def preprocessVocab(self, stopWords): stopWords = [w.strip() for w in open(stopWords)] self.__backGround = {} totalVocab = self.__vocab.keys() for word in totalVocab: freq = self.__vocab[word] if freq >=5 and self.__vocabDocCount[word]>=50 and word not in stopWords: self.__backGround[word] = freq else: del self.__vocab[word] for comm in self.__communutyWiseVocab.iterkeys(): commVocab = self.__communutyWiseVocab[comm].keys() for word in commVocab: if word in self.__vocab: continue del self.__communutyWiseVocab[comm][word] sys.stderr.write("Filtered Word types "+str(len(self.__backGround))+"\n") def __timeDiff(self, record): postTime = str(record[4]) user = str(record[5]) userJoin = self.__userJoins[user] return self.__diff(postTime, userJoin) def test(self): sampleTime = "November 17th, 2007, 4:21 pm" sampleTime2 = "October 11th, 2006, 3:15 am" print self.__diff(sampleTime, sampleTime2) def __diff(self, time1, time2): #print time1, time2 year1 = int(time1.split(',')[1].strip()) year2 = int(time2.split(',')[1].strip()) diff = 0 if year1 > year2: temp = time1 time1 = time2 time2 = temp diff = (year1 - year2 - 1)*12 elif year1 < year2: diff = (year2 - year1 - 1)*12 month1 = time1.split(' ')[0] month2 = time2.split(' ')[0] diff += 12 - self.__months[month1] + self.__months[month2] #if diff < 0: # print year1, year2, time1, time2 return diff def 
numUsers(self, comm, time): usersSet = set() for index in self.__commWiseTimeSeparatedIndices[comm][time]: user = self.__data[index][5] usersSet.add(user) return len(usersSet) def divideBasedOnTimes(self): for comm in self.__commWiseIndices.iterkeys(): for index in self.__commWiseIndices[comm]: timeDiff = self.__timeDiff(self.__data[index]) self.__commWiseTimeSeparatedIndices[comm][timeDiff].append(index) return for comm in self.__commWiseTimeSeparatedIndices.iterkeys(): for time in self.__commWiseTimeSeparatedIndices[comm].iterkeys(): if time < 25: print comm, time, len(self.__commWiseTimeSeparatedIndices[comm][time]), self.numUsers(comm, time) def __wordDist(self, data): totalWords = 0 dist = dd(lambda:1) for record in data: #print record ##record = self.__data[record] ## Change this based on analysis.. Bad code!! text = record[1] #print text tokenDict = self.freqVector(self._tokenize(text)) for word, freq in tokenDict.iteritems(): if word in self.__vocab: dist[word] += freq totalWords += freq for word in self.__vocab: dist[word] += 0 totalWords += len(self.__vocab) for word in self.__vocab: dist[word] /= float(totalWords) ##dist[word] = round(-1*self.myLog(dist[word]),2) #print dist return dist def splitUserWise(self, data): userWise = dd(list) for record in data: userWise[record[5]].append(record) return userWise def KLDAnalysis(self, comm): print comm userWiseKLD = dd(lambda:dd(int)) data = [self.__data[index] for index in self.__commWiseTimeSeparatedIndices[comm][1]] userWiseData = self.splitUserWise(data) #sampledData = random.sample(data, 1000) #m1Dist = self.__wordDist(sampledData) ##data25 = [self.__data[index] for index in self.__commWiseTimeSeparatedIndices[comm][25]] ##m25Dist = self.__wordDist(data25) #dataSecondYear = [] #for for time in range(2,25): data = [self.__data[index] for index in self.__commWiseTimeSeparatedIndices[comm][time]] userWiseMonthData = self.splitUserWise(data) #sampledData = random.sample(data, 1000) for user in userWiseMonthData.iterkeys(): userDist = self.__wordDist(userWiseMonthData[user]) #monthDist = self.__wordDist(sampledData) kld = self.KLD(m1Dist, monthDist) #kldWith25 = self.KLD(monthDist, m25Dist) #sys.stdout.write(str(time)+'\t'+str(kld)+'\t'+str(kldWith25)+'\n') sys.stdout.write(str(time)+'\t'+str(kld)+'\n') def myLog(self, x): #try: return math.log(x) #except ValueError: # return -100000 #except ZeroDivisionError: # return 100000 def KLD(self, P, Q): kld = 0 for word in P.iterkeys(): p = P[word] pbyq = P[word]/Q[word] kld += p*self.myLog(pbyq) return kld def KLDivergenceAnalysis(self): ##for comm in self.__commWiseTimeSeparatedIndices.iterkeys(): self.KLDAnalysis('AllTalk') '''def regress(self): for comm in self.__commWiseTimeSeparatedIndices.iterkeys(): instances = dd(list) #instances = [] users = set() for time in range(1,25): for index in self.__commWiseTimeSeparatedIndices[comm][time]: record = self.__data[index] user = str(record[5]) users.add(user) #print len(users) users = set(random.sample(list(users),min(len(users),1500))) for time in range(1,25): for index in self.__commWiseTimeSeparatedIndices[comm][time]: record = self.__data[index] user = str(record[5]) if user not in users: continue instances[user+'_'+str(time)].append(index) #instances.append((index,time)) #instances = random.sample(instances, 1000) regInstances = self.createRegInstances(instances) print comm, len(regInstances) model = creg.LinearRegression() model.fit(creg.RealvaluedDataset(regInstances), l1=0.1) outFile = open("weights_"+comm.strip().replace(' ',''),"w") 
weights = sorted([(W,w) for W,w in model.weights],cmp=myCMP) for weight in weights: outFile.write(weight[0]+'\t'+str(weight[1])+'\n') outFile.close() del regInstances del instances''' def createRegInstances(self, instances): regInstances = [] for userTime in instances.iterkeys(): dataIndices = instances[userTime] data = [self.__data[index] for index in dataIndices] ## __wordDist expects records, not indices time = int(userTime.split('_')[1]) wordDist = self.__wordDist(data) regInstances.append((wordDist,-1*self.myLog(time))) return regInstances
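## Worked toy example of the divergence computed by TimeDividedData.KLD above:
## KLD(P || Q) = sum_w P(w) * log(P(w) / Q(w)).  The two distributions below are
## made-up values purely for illustration; both sum to 1 over the same vocabulary.
import math

def kld(P, Q):
    return sum(P[w] * math.log(P[w] / Q[w]) for w in P)

P = {'forum': 0.5, 'post': 0.3, 'user': 0.2}
Q = {'forum': 0.4, 'post': 0.4, 'user': 0.2}
# kld(P, Q) = 0.5*log(1.25) + 0.3*log(0.75) + 0.2*log(1.0) ~= 0.0253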