def removeRetweetsAbsolute():
    l = [ ]

    for doc in SimpleDoc.getall():
        if len(doc.tweets) >= 1:
            numNormalTweets = sum(1 for tweet in doc.tweets if not tweet.isRetweet())
            numRetweets = sum(1 for tweet in doc.tweets if tweet.isRetweet())

            l.append([numNormalTweets, numRetweets])

    diffs = map(lambda x: x[0]-x[1], l)

    """relBins, labels = pieData([
        [lambda x: x==0, "diff: 0"], 
        [lambda x: x==1, "diff: 1"],
        [lambda x: x>=2 and x <= 5, "diff: 2-5"],
        [lambda x: x>5, "diff: >5"]
    ], diffs)"""

    relBins, labels = pieData([
        [lambda x: x>=6 and x <=10, "diff: 6-10"],
        [lambda x: x>=11 and x <=50, "diff: 11-50"],
        [lambda x: x>50, "diff: >50"]
    ], diffs)

    plt.figure()
    plt.pie(relBins, autopct='%1.1f%%', 
        startangle=90, labels=labels)

    plt.title("Differenzen in Anzahl Tweets zu Dokument, wenn Retweets entfernt werden.")
    plt.show()
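
# 'pieData' comes from a helper module that is not shown here. From its call
# sites it appears to take a list of [predicate, label] pairs plus a value
# list and to return per-bin counts together with the labels (plt.pie
# normalizes the counts to shares itself). A minimal sketch under that
# assumption:
def _pieData_sketch(conditions, values):
    values = list(values)
    labels = [label for _, label in conditions]
    bins = [sum(1 for v in values if predicate(v)) for predicate, _ in conditions]
    return bins, labels
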
def removeRetweetsRelative():
    l = [ ]

    for doc in SimpleDoc.getall():
        if len(doc.tweets) >= 1:
            numNormalTweets = sum(1 for tweet in doc.tweets if not tweet.isRetweet())
            numRetweets = sum(1 for tweet in doc.tweets if tweet.isRetweet())

            l.append([numNormalTweets+numRetweets, numNormalTweets])

    diffs = map(lambda x: float(x[0]-x[1])/x[0], l)

    relBins, labels = pieData([
        [lambda x: x==0.0, "0%"], 
        [lambda x: x>0.0 and x<=0.3, "0%<d<=30%"],
        [lambda x: x>0.3 and x<=0.5, "30%<d<=50%"],
        [lambda x: x>0.5, ">50%"]
    ], diffs)

    plt.figure()
    plt.pie(relBins, autopct='%1.1f%%', 
        startangle=90, labels=labels)

    plt.title("Differenzen in Prozent, Retweets entfernt werden")
    plt.show()
def publication_years():
    plt.figure(num=None, figsize=(8, 4), dpi=80, facecolor='w', edgecolor='k')
    publicationYears = list(simpleDoc.publicationDatetime().year for simpleDoc in SimpleDoc.getallBetween(None, None))
    histDisc(plt, publicationYears, width = 0.5)
    # plt.savefig(figurePath("publication_years.png"))
    plt.tight_layout()
    plt.show()
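
# 'histDisc' is another unshown helper; judging from its use it draws a bar
# chart of how often each distinct value occurs. A minimal sketch under that
# assumption:
def _histDisc_sketch(plot, values, width=0.5):
    from collections import Counter
    counts = sorted(Counter(values).items())  # (value, frequency) pairs sorted by value
    positions = range(len(counts))
    plot.bar(positions, [c for _, c in counts], width=width)
    plot.xticks([p + width/2 for p in positions], [str(v) for v, _ in counts], rotation=90)
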
def tweetVsMendeleyReaders(yearBounds = [None, None], maxTweets = 300, maxReaders = 300):
    tweetVsMendeleyReaderList = []

    totalDocs = 0
    for doc in filter(lambda doc: 
        (doc.mendeleyReaders != None and doc.mendeleyReaders<=maxReaders) and 
            (doc.tweets != None and len(doc.tweets)<=maxTweets) and
            (not yearBounds[0] or doc.publicationDatetime().year>=yearBounds[0]) and
            (not yearBounds[1] or doc.publicationDatetime().year<=yearBounds[1]), 
        SimpleDoc.getall()
    ):
        tweetVsMendeleyReaderList.append([len(doc.tweets), doc.mendeleyReaders])
        totalDocs += 1

    x, y = zip(*tweetVsMendeleyReaderList)
    

    plt.figure()

    plt.scatter(x, y)
    plt.title("Korrelation zwischen Tweets und Zitationen (Papieren zwischen " + str(yearBounds[0]) + " und " + str(yearBounds[1]) + "; #Docs: " + str(totalDocs) + ")")
    plt.ylabel("#Tweets (1-" + str(maxTweets) + ")")
    plt.xlabel("#Readers (1-" + str(maxReaders) + ")")

    p = numpy.polyfit(x, y, 1)
    xTrend = range(min(x), max(x)+1)
    yTrend = map(lambda x: numpy.polyval(p, x), xTrend)
    plt.plot(xTrend, yTrend, color='r')

    plt.figtext(0.80, 0.05, 'correlation coefficient: ' + str(korrelationskoeffizient(x, y)))

    plt.show()
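
# 'korrelationskoeffizient' (German for correlation coefficient) is imported
# from an unshown helper module. Assuming it computes Pearson's r, numpy
# already provides the same value:
def _korrelationskoeffizient_sketch(x, y):
    # numpy.corrcoef returns the 2x2 correlation matrix; entry [0, 1] is r(x, y)
    return numpy.corrcoef(x, y)[0, 1]
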
def numTweets():
    plt.figure()
    
    numTweets = list(len(simpleDoc.tweets) for simpleDoc in SimpleDoc.getall())
    labels, values = hist(numTweets, [1, 2, 5, 10, 20, 50, 100, 500, 1000])
    barPlot(plt, labels, values)
    plt.show()
def removeDoubleUsersAbsolute():
    l = [ ]

    for doc in SimpleDoc.getall():
        tweetUsers = map(lambda x: x.user, doc.tweets)
        if len(tweetUsers) >= 1:
            l.append([len(tweetUsers), len(set(tweetUsers))])

    diffs = map(lambda x: x[0]-x[1], l)

    """relBins, labels = pieData([
        [lambda x: x==0, "diff: 0"], 
        [lambda x: x==1, "diff: 1"],
        [lambda x: x>=2 and x <= 5, "diff: 2-5"],
        [lambda x: x>5, "diff: >5"]
    ], diffs)"""

    relBins, labels = pieData([
        [lambda x: x>=6 and x<=10, "diff: 6-10"],
        [lambda x: x>=11 and x<=50, "diff: 11-50"],
        [lambda x: x>=51 and x<=300, "diff: 51-300"]
    ], diffs)

    plt.figure()
    plt.pie(relBins, autopct='%1.1f%%', 
        startangle=90, labels=labels)

    plt.title("Differenzen in Anzahl Tweets zu Dokument, wenn doppelte Benutzer entfernt werden.")
    plt.show()
def mendeleyDisciplines():
    def generalCondition(doc):
        time = doc.publicationDatetime()
        return ( 
                (time.year == 2012 and time.month >= 6 and time.month <= 8) or 
                (time.year > 2012) 
            ) and doc.mendeleyDisciplines != None

    def domainDocs(domain):
        return [doc for doc in consideredDocs if generalCondition(doc) and domain in doc.mendeleyDisciplines]

    consideredDocs = list(filter(lambda doc: generalCondition(doc), SimpleDoc.getall()))
    totalDocs = len(consideredDocs)

    distinctDomains = set()
    for doc in consideredDocs:
        distinctDomains |= set(doc.mendeleyDisciplines)

    domainData = []
    for domain in distinctDomains:
        d = domainDocs(domain)
        numDocs = len(d)
        meanTweets = numpy.mean([doc.numTweets() for doc in d])

        domainData.append((domain, numDocs, ("%2.2f" % (float(numDocs)*100 / totalDocs)) + "\\%", "%2.2f" % meanTweets ))

    domainDataSorted = sorted(domainData, key=lambda x: x[1], reverse=True)

    compileTex(
        simpleTabular(["Disziplin", "\\#Dokumente", "Anteil", "AvgTweets"], domainDataSorted, orientation="lrrr"),
        figurePath("mendeleyDisciplines2.pdf")
    )
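
# 'simpleTabular' and 'compileTex' come from an unshown helper module. From
# the call above, simpleTabular seems to render a header plus data rows as a
# LaTeX tabular whose column alignment string is 'orientation'. A minimal
# sketch under that assumption:
def _simpleTabular_sketch(header, rows, orientation="l"):
    lines = ["\\begin{tabular}{" + orientation + "}"]
    lines.append(" & ".join(header) + " \\\\ \\hline")
    for row in rows:
        lines.append(" & ".join(map(str, row)) + " \\\\")
    lines.append("\\end{tabular}")
    return "\n".join(lines)
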
def twitterHist():
    plt.figure(num=None, figsize=(8, 4), dpi=80, facecolor='w', edgecolor='k')

    tweetTime = [(tweet.datetime().year, tweet.datetime().month) for doc in SimpleDoc.getall() for tweet in doc.tweets]
    histDisc(plt, tweetTime, width=0.5)
    #plt.title("Verteilung der Tweets nach Jahr und Monat")
    plt.tight_layout()
    plt.show()
def crossrefVsTwitter(yearBounds = [None, None], minTweetAge = None, maxTweetAge = None):
    tweetVsCrossrefList = []
    # NOTE: the parameters above are immediately overridden with fixed values
    # (only tweets from the first 100 days after publication are counted)
    minTweetAge = 60*60*24*0
    maxTweetAge = 60*60*24*100

    totalDocs = 0
    totalTweets = 0
    nullWeights = 0
    nonNullWeights = 0
    for doc in filter(
        lambda doc: 
            (doc.publicationDatetime().year==2012 and 
            doc.publicationDatetime().month>=6 and doc.publicationDatetime().month<=8),
        SimpleDoc.getall()
    ):
        docsTweets = filter(lambda tweet: 
            (not minTweetAge or (tweet.timestamp-doc.publicationTimestamp) >= minTweetAge) and
            (not maxTweetAge or (tweet.timestamp-doc.publicationTimestamp) <= maxTweetAge), 
            doc.tweets)
        
        def userWeight(tweet):
            user = tweet.user()
            return None if user is None else user.weight()

        userWeights = map(lambda tweet: userWeight(tweet), docsTweets)

        nullWeights += sum((1 for weight in userWeights if weight is None))
        nonNullWeights += sum((1 for weight in userWeights if not weight is None))

        tweetVsCrossrefList.append([doc.numCrossrefs(), 0 if len(userWeights) == 0 else sum(filter(lambda weight: weight != None, userWeights))])
        totalDocs += 1
        totalTweets += len(docsTweets)

    print totalDocs
    print totalTweets
    print float(nullWeights) / (nullWeights+nonNullWeights)

    # tweetVsCrossrefList = sorted(tweetVsCrossrefList, key=lambda tc: tc[1], reverse=True)[:100]
    x, y = zip(*tweetVsCrossrefList)
    paperFigure(plt)
    plt.scatter(x, y)
    # plt.title("Korrelation zwischen Tweets und Zitationen (Papieren zwischen " + str(yearBounds[0]) + " und " + str(yearBounds[1]) + "; #Docs: " + str(totalDocs) + ")")
    plt.ylabel("#Tweets")
    plt.xlabel("#Crossrefs")
    #plt.xlim((0,200))
    #plt.ylim((0,30))

    p = numpy.polyfit(x, y, 1)
    xTrend = range(min(x), max(x)+1)
    yTrend = map(lambda x: numpy.polyval(p, x), xTrend)
    plt.plot(xTrend, yTrend, color='r')

    # plt.figtext(0.80, 0.05, 'correlation coefficient: ' + str(korrelationskoeffizient(x, y)))
    print 'correlation coefficient: ' + str(korrelationskoeffizient(x, y))
    plt.tight_layout()
    plt.show()
def groupByJournalAndVolume():
    issns = { }
    docs = list(SimpleDoc.getall())
    for doc in docs:
        issns[doc.issn] = issns.get(doc.issn, 0) + 1

    validIssns = map(lambda kv: kv[0], filter(lambda item: item[1]>5 and item[0] != None, issns.items()))

    groups = { }
    for doc in docs:
        if doc.issn in validIssns:
            groupList = groups.get((doc.issn, doc.volume), [])
            # groupList = groups.get(doc.issn, [])
            groupList.append(doc)
            groups[(doc.issn, doc.volume)] = groupList
            # groups[doc.issn] = groupList

    validGroups = filter(lambda group: len(group[1]) > 5, groups.items())
    # validGroups = groups.items()
    
    correlationValues = []    
    for ident, docs in validGroups:
        docTweets = map(lambda doc: doc.numTweets(), docs)
        docCrossrefs = map(lambda doc: doc.numCrossrefs(), docs)
        korr = None

        # docTweetCrossrefRatios = map(lambda doc: [float(doc.numTweets()) / doc.numCrossrefs() if doc.numCrossrefs() != 0 else float('nan')], docs)

        maxYear = max(map(lambda doc: doc.publicationDatetime().year, docs))
        minYear = min(map(lambda doc: doc.publicationDatetime().year, docs))

        yearRange = None
        if maxYear == minYear:
            yearRange = str(minYear)
        else:
            yearRange = str(minYear) + "-" + str(maxYear)

        try:
            korr = "%2.3f" % korrelationskoeffizient(docTweets, docCrossrefs)
        except ZeroDivisionError:
            korr = "NaN"
            
        # correlationValues.append([ident[0], ident[1], len(docs), "%2.2f" % numpy.mean(docTweets), "%2.2f" % numpy.std(docTweets), korr, yearRange])
        correlationValues.append([
            ident[0], ident[1], len(docs),
            "%2.2f" % numpy.mean(docTweets),
            "%2.2f" % numpy.mean(docCrossrefs),
            "%2.2f" % (float(numpy.sum(docTweets))/numpy.sum(docCrossrefs)),
            yearRange
        ])
        # correlationValues.append([ident, len(docs), "%2.2f" % numpy.mean(docTweets), "%2.2f" % numpy.std(docTweets), korr])

    correlationValues = sorted(correlationValues, key=lambda x: x[0])

    compileTex(
        # simpleTabular(["ISSN", "Volume", "\\#Docs", "AVG Tweets", "StdDev", "korr", "Years"], correlationValues, orientation="llrrrrl"),
        simpleTabular(["ISSN", "Volume", "\\#Docs", "AVG T", "AVG C", "T/C", "Years"], correlationValues, orientation="llrrrrl"),
        # simpleTabular(["ISSN", "\\#Docs", "AVG Tweets", "StdDev", "korr", "Years"], correlationValues, orientation="lrrrrl"),
        figurePath("correlationsInJournals2.pdf")
    )
def overviewData():
    # earliest tweet timestamp: 1337079632
    docs = list(SimpleDoc.getall())
    tweets = [ tweet for doc in docs for tweet in doc.tweets ]

    print "total Tweets: " + str(len(tweets))
    print "latest tweet: " + str(max(tweets, key=lambda x: x.timestamp).datetime())
    print "earliest tweet: " + str(min(tweets, key=lambda x: x.timestamp).datetime())
    
    print "total Documents: " + str(len(docs))
    print "latest document: " + str(max(docs, key=lambda doc: doc.publicationTimestamp).publicationDatetime())
    print "earliest document: " + str(min(docs, key=lambda doc: doc.publicationTimestamp).publicationDatetime())
def tweetsBySpecificUserCorrelations():
    docs = SimpleDoc.getallBetween((2012, 6), (2012, 8))

    pairs = []
    for doc in docs:
        numTweets = len(filter(lambda tweet: tweet.username == "ATP_CME", doc.tweets))
        citations = doc.averageCitations()
        pairs.append([numTweets, citations])

    x, y = zip(*pairs)

    print allCorrelations(x, y)
    plt.scatter(x, y)
    plt.show()
def userHist():
    plt.figure()

    users = []
    for doc in SimpleDoc.getall():
        for tweet in doc.tweets:
            users.append(tweet.user)

    userGroupCounts = sorted(groupCount(users), key=lambda x: x[1], reverse=True)
    filteredUserGroupCounts = filter(lambda x: x[1]>=2, userGroupCounts)

    plt.plot(range(1, len(filteredUserGroupCounts)+1), map(lambda x: x[1], filteredUserGroupCounts))
    plt.title("Users sorted by number of tweets (users with a single tweet cut off)")
    plt.xlabel("User rank")
    plt.ylabel("#Tweets")
    plt.show()
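
# 'groupCount' is not defined in this file; from its use it maps a sequence
# to (item, count) pairs. A minimal sketch using collections.Counter:
def _groupCount_sketch(items):
    from collections import Counter
    return list(Counter(items).items())
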
def tweetHist():
    docs = list(filter(
        lambda doc: 
            (doc.publicationDatetime().year==2012 and 
            doc.publicationDatetime().month>=6 and doc.publicationDatetime().month<=8),
        SimpleDoc.getall()
    ))

    numTweets = [doc.numTweets() for doc in docs]
    plt.figure()
    plt.hist(numTweets, bins=xrange(0, 150, 5))
    
    plt.figure()
    numCite = [doc.numCrossrefs() for doc in docs]
    plt.hist(numCite, bins=xrange(0, 150, 5))
    plt.show()
def topUsers():
    plt.figure()

    users = []
    for doc in SimpleDoc.getall():
        for tweet in doc.tweets:
            users.append(tweet.user)

    userGroupCounts = sorted(groupCount(users), key=lambda x: x[1], reverse=True)
    topUsers = userGroupCounts[:10]

    users, values = zip(*topUsers)

    barPlot(plt, list(users), list(values))
    plt.title("Top 10 Users")
    plt.ylabel("#Tweets")
    plt.show()
def numRetweets():
    numNormalTweets = 0
    numRetweets = 0

    for doc in SimpleDoc.getall():
        for tweet in doc.tweets:
            if tweet.isRetweet():
                numRetweets += 1
            else:
                numNormalTweets += 1

    totalTweets = numNormalTweets + numRetweets
    relNormalTweets = float(numNormalTweets)*100 / totalTweets
    relRetweets = float(numRetweets)*100 / totalTweets

    plt.figure()
    plt.pie([relNormalTweets, relRetweets], autopct='%1.1f%%', 
        startangle=90, labels=[
            'normal tweets (' + str(numNormalTweets) + ')', 
            'Retweets (' + str(numRetweets) + ')'
        ], colors=['green', 'yellow'])

    plt.show()
def removeDoubleUsersRelative():
    l = [ ]

    for doc in SimpleDoc.getall():
        tweetUsers = map(lambda x: x.user, doc.tweets)
        if len(tweetUsers) >= 1:
            l.append([len(tweetUsers), len(set(tweetUsers))])

    diffs = map(lambda x: float(x[0]-x[1])/x[0], l)

    relBins, labels = pieData([
        [lambda x: x==0.0, "0%"], 
        [lambda x: x>0.0 and x<=0.1, "0%<d<=10%"],
        [lambda x: x>0.1 and x<=0.3, "10%<d<=30%"],
        [lambda x: x>0.3, ">30%"]
    ], diffs)

    plt.figure()
    plt.pie(relBins, autopct='%1.1f%%', 
        startangle=90, labels=labels)

    plt.title("Differenzen in Anzahl Tweets zu Dokument, wenn doppelte Benutzer entfernt werden.")
    plt.show()
def alteringTweetStreamAfterFirstPeak():
    relativeTweetDiffAfter1WeekAndTotal = map(
        lambda doc: 
            float(doc.numTweets()) / doc.numTweetsBetweenRelative(None, 60*60*24*7),
            filter(lambda doc: doc.numTweetsBetweenRelative(None, 60*60*24*7) >= 5, SimpleDoc.getall())
    )

    relBins, labels = pieData([
        [lambda x: x==1.0, "+0%"], 
        [lambda x: x>1.0 and x<=1.1, "+0-10%"],
        [lambda x: x>1.1 and x<=1.2, "+10-20%"],
        [lambda x: x>1.2 and x<=1.3, "+20-30%"],
        [lambda x: x>1.3 and x<=1.4, "+30-40%"],
        [lambda x: x>1.4 and x<=1.5, "+40-50%"],
        [lambda x: x>1.5, ">50%"]
    ], relativeTweetDiffAfter1WeekAndTotal)

    plt.figure()
    plt.pie(relBins, autopct='%1.1f%%', 
        startangle=90, labels=labels)

    plt.title("Relative Anzahl Tweets im Vergleich zu Anzahl Tweets nach einer Woche")
    plt.show()
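
# 'numTweetsBetweenRelative' is a SimpleDoc method defined elsewhere; from
# the call above it presumably counts tweets whose offset from the
# publication timestamp lies within the given bounds (None = unbounded).
# A sketch under that assumption, with 'doc' standing in for a SimpleDoc:
def _numTweetsBetweenRelative_sketch(doc, lower, upper):
    return sum(
        1 for tweet in doc.tweets
        if (lower is None or (tweet.timestamp - doc.publicationTimestamp) >= lower)
        and (upper is None or (tweet.timestamp - doc.publicationTimestamp) <= upper)
    )
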
def correlationsForQuartals():
    quartals = [
        [3, 2008],
        [0, 2009], [1, 2009], [2, 2009], [3, 2009],
        [0, 2010], [1, 2010], [2, 2010], [3, 2010],
        [0, 2011], [1, 2011], [2, 2011], [3, 2011],
        [0, 2012], [1, 2012], [2, 2012], [3, 2012],
        [0, 2013], [1, 2013]
    ]

    def docInQuartal(doc, quartal):
        if quartal[1] != doc.publicationDatetime().year:
            return False
        elif quartal[0] == 0:
            return doc.publicationDatetime().month >=1 and doc.publicationDatetime().month<=3
        elif quartal[0] == 1:
            return doc.publicationDatetime().month >=4 and doc.publicationDatetime().month<=6
        elif quartal[0] == 2:
            return doc.publicationDatetime().month >=7 and doc.publicationDatetime().month<=9
        elif quartal[0] == 3:
            return doc.publicationDatetime().month >=10 and doc.publicationDatetime().month<=12
        else:
            raise ValueError("Argument quartal consists of a tuple [quartal, year] where quartal must be between 0 and 3")

    allDocs = list(SimpleDoc.getall())
    coefficients = []
    for quartal in quartals:
        docs = filter(lambda doc: docInQuartal(doc, quartal) and doc.mendeleyReaders != None, allDocs)
        print len(docs)
        x, y = zip(*map(lambda doc: [len(doc.tweets), doc.crossrefTimeline[0].totalCrossrefs], docs))

        coefficients.append(korrelationskoeffizient(x, y))

    plt.figure()
    plt.plot(range(0, len(quartals)), coefficients)
    plt.show()
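
# The if/elif chain in docInQuartal maps quarter index q (0-3) to the month
# range [3*q+1, 3*q+3]; an equivalent arithmetic check would be:
def _inQuartal_sketch(month, q):
    return 3*q + 1 <= month <= 3*q + 3
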
# expertCategories = ['Medicine', 'Health' ]

wordExperts = getWordExperts(expertWords)
# patrickExperts = getPatrickExperts(expertCategories)

"""bioDocs = minimizedDocs(
    filter(
        lambda doc: 
            doc.mendeleyDisciplines != None and 'Biological Sciences' in doc.mendeleyDisciplines, 
            SimpleDoc.getallBetween((2012,6), (2012,8))
    ),
    metrics
)"""

docs = minimizedDocs(
    SimpleDoc.getallBetween((2012,6), (2012,8)),
    metrics
)

usersInTimewindow = set((usr for doc in docs for usr in doc[0]))
totalNumTweets = sum((1 for doc in docs for u in doc[0]))

"""f = open("baselines", "w")

for numTweets in range(100, totalNumTweets, 100):
    print str(numTweets) + " / " + str(totalNumTweets)
    baseline = getBaseline(docs, metricNames, numTweets)
    f.write(json.dumps( { "num-tweets" : numTweets, "baseline" : baseline } ) + "\n")
    f.flush()

f.close()"""
def canBeEncoded(text):
    try:
        str(text)
        return True
    except UnicodeEncodeError:
        return False
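
# Example: under Python 2's default ASCII codec, str(u"caf\xe9") raises
# UnicodeEncodeError, so canBeEncoded(u"caf\xe9") is False while
# canBeEncoded(u"cafe") is True.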

def tweetsBetweenDay(documents, lowerBound, upperBound):
    return [[tweet.text, tweet.timestamp, tweet.username, doc.doi, doc.title, doc.publicationTimestamp] for doc in documents for tweet in doc.tweets 
        if 
            ((lowerBound*60*60*24) <= (tweet.timestamp - doc.publicationTimestamp) <= (upperBound*60*60*24)) and
            canBeEncoded(tweet.text) and
            canBeEncoded(doc.title)
    ]

import random  # used by random.sample / random.shuffle below

relevantDocuments = SimpleDoc.getallBetween((2012, 6), (2012, 8))

tweets = []
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 0, 1), 111))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 1, 3), 111))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 3, 5), 111))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 7, 30), 333))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 100, 300), 333))

tweetTexts = map(
    lambda tweetdata: "\t".join([str(tweetdata[0]), str(tweetdata[1]), tweetdata[2], tweetdata[3], tweetdata[4], str(tweetdata[5])]),
    tweets
)
random.shuffle(tweetTexts)

f = open("tweetTexts_1.txt", "w")
for text in tweetTexts[0:333]:
    f.write(text.replace("\n", " ").replace("\"", "").replace("'", "") + "\n")
f.close()
import json
from main.util.db import openDb
from scipy import stats
from main.util.common import SimpleDoc, powerset, Log
import math
import itertools

expertTopics = list(map(lambda s: s.strip(), open("data/expert_topics", "r")))

l = Log(filename="foo", verbose=True)

docs = map(
    lambda doc: [
        map(lambda tweet: tweet.username, doc.tweets),
        map(lambda metric: metric[1](doc), metrics)
    ],
    SimpleDoc.getallBetween((2012,6), (2012,8))
)

baseline = { }

for ind, metricName in enumerate(metricNames):
    pairs = []
    for doc in docs:
        numTweets = len(doc[0])
        metricScore = doc[1][ind]
        pairs.append([numTweets, metricScore])

    x, y = zip(*pairs)
    s, p = stats.spearmanr(x, y)

    baseline[metricName] = s

count = 0
count2 = 0
for ind, metricName in enumerate(metricNames):
    pairs = []
def userCorrelationToDiscipline():
    """
    zuerst user_disc_map erstellen:
    [ user1 : [ 
        [mendDisc1_1, mendDisc1_2, ...], // Liste von Disziplinen pro Tweet des Nutzers
        [mendDisc2_1, mendDisc2_2, ...]
    ], user2: [
        ...
    ] ]
    """
    if not os.path.isfile(dataPath("user_disc_map.json")):
        userDiscList = []

        for doc in SimpleDoc.getall():
            twitterUsers = [tweet.user for tweet in doc.tweets]
            disciplines = doc.mendeleyDisciplines
            if len(twitterUsers)!=0 and disciplines!=None and len(disciplines)!=0:
                for twitterUser in twitterUsers:
                    userDiscList.append([twitterUser, disciplines])
        
        userDiscMap = {}
        for item in userDiscList:
            discList = userDiscMap.get(item[0], [])
            discList.append(item[1])
            userDiscMap[item[0]] = discList

        writeJsonToData(userDiscMap, "user_disc_map.json")
    else:
        userDiscMap = readJsonFromData("user_disc_map.json")


    """
    dann "user_disc_count_map" erstellen:
    [ user1 : { 
        "total_posts" : n,
        "user_posts_in_desc" : {
            "disc1" : n_1,
            "disc2" : n_2, 
            ...
        }
    }, user2: {
        ...
    } ]
    """
    if not os.path.isfile(dataPath("user_disc_count_map.json")):
        userDiscCountMap = { }
        for user, descListList in userDiscMap.items():
            totalPosts = len(descListList)
            allUsersDesc = set()
            for descList in descListList:
                allUsersDesc |= set(descList)

            userPostsInDesc = { }
            for desc in allUsersDesc:
                postsInDesc = sum(1 for descList in descListList if desc in descList)
                userPostsInDesc[desc] = postsInDesc

            userDiscCountMap[user] = { "total_posts" : totalPosts, "user_posts_in_desc" : userPostsInDesc }

        writeJsonToData(userDiscCountMap, "user_disc_count_map.json")
    else:
        userDiscCountMap = readJsonFromData("user_disc_count_map.json")

    for user, userdata in userDiscCountMap.items():
        totalPosts = userdata['total_posts']

        relCounts = []
        for desc, count in userdata['user_posts_in_desc'].items():
            relCounts.append([desc, float(count)/totalPosts])

        relCounts = sorted(relCounts, key=lambda x: x[1], reverse=True)

        if totalPosts > 50:
            print user
            print relCounts
            print "\n\n"
def cummulativeTwitterPlots():
    # twitterTimelines, publicationTimestamps = zip(*filter(lambda timelinePubTs: len(timelinePubTs[0]) != 0, map(lambda doc: [doc.cummulativeTwitterTimeline(), doc.publicationTimestamp], SimpleDoc.getall())))
    twitterTimelines = filter(
        lambda tl: len(tl) != 0,
        map(
            lambda doc: map(lambda point: [point[0]-doc.publicationTimestamp, point[1]], doc.cummulativeTwitterTimeline()),
            SimpleDoc.getall()
        )
    )

    # twitterTimelines = filter(lambda tl: len(tl) < 20, twitterTimelines)
    # twitterTimelines = filter(lambda tl: len(tl) > 50, twitterTimelines)
    plt.figure()
    for timeline in twitterTimelines:
        x, y = zip(*timeline)
        plt.plot(x, y)
    
    plt.show()
def correlationTimeTweets():
    x, y = zip(*map(lambda doc: [doc.publicationTimestamp, len(doc.tweets)], SimpleDoc.getall()))
    print korrelationskoeffizient(x, y) # 0.082
def distFirstTweetToDoc():
    allDocs = list(SimpleDoc.getall())

    for param in range(20, 100):
        diffs = []

        maximumTweetAge = 60*60*24*param
        minimumTweetAge = 60*60*24*10
        for doc in filter(lambda doc: len(doc.tweets) != 0 and doc.age() >= maximumTweetAge, allDocs):
            pubTimestamp = doc.publicationTimestamp
            # firstTweetTimestamp = max([tweet.timestamp for tweet in doc.tweets])
            diffs.extend([tweet.timestamp-pubTimestamp for tweet in 
                filter(
                    lambda tweet: (tweet.timestamp-doc.publicationTimestamp) < maximumTweetAge and (tweet.timestamp-doc.publicationTimestamp) > minimumTweetAge,
                    doc.tweets
                )
            ])

        maxBins = 30
        timeslot = (float(maximumTweetAge)-float(minimumTweetAge))/maxBins

        def binNr2Bound(binNr):
            return minimumTweetAge+(binNr*timeslot)

        binConditions = map(
            lambda binNr: [lambda x: x>binNr2Bound(binNr) and x<=binNr2Bound(binNr+1), str(binNr) + "X"],
            range(0, maxBins)
        )

        # binConditions.append([lambda x: x>binNr2Bound(maxBins), ">" + str(maxBins-1) + str("X")])

        diffBins, diffLabels = pieData(binConditions, diffs)
        
        distBinConditions = map(
            lambda binNr: [lambda x: x==binNr, "X=" + str(binNr)],
            range(0, maxBins)
        )

        def getBins(beta, binConditions):
            s = map(lambda x: int(x), numpy.random.exponential(beta, 10000))
            bins, labels = pieData(binConditions, s)
            return bins

        def binDiffs(bins1, bins2):
            return sum(map((lambda (a, b): abs(a-b)), zip(bins1, bins2)))

        def searchInRangeRec(minBeta, maxBeta, steps, depth, maxDepth):
            minError = min(
                map(lambda beta: [beta, binDiffs(getBins(beta, distBinConditions), diffBins)], numpy.arange(minBeta, maxBeta, steps)),
                key = lambda x: x[1]
            )

            errorBelow = binDiffs(getBins(minError[0]-(float(steps)/2), distBinConditions), diffBins)
            errorAbove = binDiffs(getBins(minError[0]+(float(steps)/2), distBinConditions), diffBins)

            if depth==maxDepth:
                return minError
            elif errorBelow <= errorAbove:
                x = searchInRangeRec(minError[0]-steps, minError[0], float(steps)/10, depth+1, maxDepth)
                return x[0], x[1]
            else:
                x = searchInRangeRec(minError[0], minError[0]+steps, float(steps)/10, depth+1, maxDepth)
                return x[0], x[1]

        beta, error = searchInRangeRec(1, 10, 1, 0, 3)

        print param, (error/maxBins)
    # s = numpy.random.poisson(1.2, 10000)
    # s = numpy.random.zipf(1.5, 10000)
    # f = 3.0
    # s = map(lambda x: (float(x)-(1+(random.random()/f)))*f, s)
    # s.extend([0] * (100*60))

    
    #binConditions2.append([lambda x: x>maxBins, ">" + str(maxBins-1) + str("X")])


    """expDistData = map(lambda x: int(x), numpy.random.exponential(beta, 10000))
    (("pmcViews", ()), (2011, 6), "PMC views"), 
    (("maxCitations", ()), (2009, 3), "Citations")
]

attributeNames = map(lambda x: x[0][0], attributeList)
attributePrintNames = map(lambda x: x[2], attributeList)
calls = map(lambda x: x[0], attributeList)
stats = []

for ind, attr in zip(range(0, len(attributeList)), attributeList):
    call = attr[0]
    lowerBound = attr[1]
    attName = attr[0][0]

    valuesForMetric = filter(lambda x: x != None, map(lambda doc: applyCall(doc, call),
        SimpleDoc.getallBetween(lowerBound, None)
    ))

    minV, maxV, meanV, std = min(valuesForMetric), max(valuesForMetric), np.mean(valuesForMetric), np.std(valuesForMetric)
    stats.append((attName, call, meanV, std, len(valuesForMetric)))
    print attName + "\t" + "\t".join(map(lambda x: str(x), [minV, maxV, meanV, std]))


statValues = []
for stat in stats:
    name = stat[0]
    call = stat[1]
    mean = stat[2]
    std = stat[3]
    numValues = stat[4]
    valuesForMetric = filter(lambda x: x != None, map(lambda doc: applyCall(doc, call),
        SimpleDoc.getallBetween(lowerBound, None)
    ))

    minV, maxV, meanV, std = min(valuesForMetric), max(valuesForMetric), np.mean(valuesForMetric), np.std(valuesForMetric)
    print attName + "\t" + "\t".join(map(lambda x: str(x), [minV, maxV, meanV, std]))
"""

cat = "Biological Sciences"
"""consideredDocs = filter(
        lambda doc: 
            doc.mendeleyDisciplines != None and cat in doc.mendeleyDisciplines, 
            SimpleDoc.getallBetween((2012,6), (2012,8))
    )"""
consideredDocs = list(SimpleDoc.getallBetween((2012,6), (2012,8)))
print len(consideredDocs)
matrix = getAttributeValueMatrix(consideredDocs, calls)
corrs = correlationBetweenEverything(matrix, attributeNames)

"""f = open("foo", "w")
for corr in corrs:
    f.write(corr.toJson() + "\n")
f.close()"""

# corrs = CorrelationItem.fromFile("stuff/pairwise_corr_2012-6_2012-8.json")

f = open("foo", "w")
m = []
for a1 in attributeNames:
    row = []