def publication_years():
    """Show a discrete histogram of publication years over the whole corpus."""
    plt.figure(num=None, figsize=(8, 4), dpi=80, facecolor='w', edgecolor='k')
    years = [doc.publicationDatetime().year for doc in SimpleDoc.getallBetween(None, None)]
    histDisc(plt, years, width=0.5)
    # plt.savefig(figurePath("publication_years.png"))
    plt.tight_layout()
    plt.show()
def tweetsBySpecificUserCorrelations():
    """For documents published 2012-06..2012-08, correlate the number of
    tweets by user "ATP_CME" per document with the document's average
    citations; print all correlation measures and show a scatter plot."""
    docs = SimpleDoc.getallBetween((2012, 6), (2012, 8))

    pairs = [
        [len([t for t in doc.tweets if t.username == "ATP_CME"]), doc.averageCitations()]
        for doc in docs
    ]

    x, y = zip(*pairs)

    print(allCorrelations(x, y))
    plt.scatter(x, y)
    plt.show()
    (("pmcViews", ()), (2011, 6), "PMC views"), 
    (("maxCitations", ()), (2009, 3), "Citations")
]

# Derived views over attributeList; judging from the fragment above, each
# entry looks like ((attributeName, callArgs), (year, month), printName).
attributeNames = map(lambda x: x[0][0], attributeList)
attributePrintNames = map(lambda x: x[2], attributeList)
calls = map(lambda x: x[0], attributeList)
# NOTE(review): this list is shadowed by `from scipy import stats` further
# down the file (L85 region) — confirm the intended execution order.
stats = []

# Per-attribute summary statistics over documents published at or after the
# attribute-specific lower bound.
for ind, attr in zip(range(0, len(attributeList)), attributeList):
    call = attr[0]        # (name, args) pair consumed by applyCall
    lowerBound = attr[1]  # (year, month) publication lower bound
    attName = attr[0][0]

    # One metric value per document; None results (metric unavailable) are dropped.
    valuesForMetric = filter(lambda x: x != None, map(lambda doc: applyCall(doc, call),
        SimpleDoc.getallBetween(lowerBound, None)
    ))

    minV, maxV, meanV, std = min(valuesForMetric), max(valuesForMetric), np.mean(valuesForMetric), np.std(valuesForMetric)
    stats.append((attName, call, meanV, std, len(valuesForMetric)))
    # Tab-separated summary line: name, min, max, mean, std.
    print attName + "\t" + "\t".join(map(lambda x: str(x), [minV, maxV, meanV, std]))


statValues = []
# NOTE(review): this loop only unpacks each stats tuple and never uses the
# values or appends to statValues — the body appears to have been truncated
# in a merge; confirm against version history.
for stat in stats:
    name = stat[0]
    call = stat[1]
    mean = stat[2]
    std = stat[3]
    numValues = stat[4]
# expertCategories = ['Medicine', 'Health' ]

# Look up expert users for the word list loaded elsewhere in the file.
wordExperts = getWordExperts(expertWords)
# patrickExperts = getPatrickExperts(expertCategories)

"""bioDocs = minimizedDocs(
    filter(
        lambda doc: 
            doc.mendeleyDisciplines != None and 'Biological Sciences' in doc.mendeleyDisciplines, 
            SimpleDoc.getallBetween((2012,6), (2012,8))
    ),
    metrics
)"""

# Reduce each document in the 2012-06..2012-08 window to a minimal record.
# Presumably doc[0] is the list of tweeting usernames (cf. the later
# `docs = map(...)` construction below) — verify against minimizedDocs.
docs = minimizedDocs(
    SimpleDoc.getallBetween((2012,6), (2012,8)),
    metrics
)

# Distinct tweeting usernames across the window.
usersInTimewindow = set((usr for doc in docs for usr in doc[0]))
# Total tweet count (duplicate usernames per doc counted individually).
totalNumTweets = sum((1 for doc in docs for u in doc[0]))

"""f = open("baselines", "w")

for numTweets in range(100, totalNumTweets, 100):
    print str(numTweets) + " / " + str(totalNumTweets)
    baseline = getBaseline(docs, metricNames, numTweets)
    f.write(json.dumps( { "num-tweets" : numTweets, "baseline" : baseline } ) + "\n")
    f.flush()

f.close()"""
import json
from main.util.db import openDb
from scipy import stats
from main.util.common import SimpleDoc, powerset, Log
import math
import itertools

# One topic per line from the expert-topics file.
# NOTE(review): the file handle is never closed — relies on GC.
expertTopics = list(map(lambda s: s.strip(), open("data/expert_topics", "r")))

l = Log(filename="foo", verbose=True)

# Reduce each document to [tweet usernames, metric scores]; each entry of
# `metrics` (defined elsewhere) is indexed as metric[1](doc), so it is a
# (name, scoringFunction) pair.
docs = map(lambda doc: [map(lambda tweet: tweet.username, doc.tweets), map(lambda metric: metric[1](doc), metrics)], SimpleDoc.getallBetween((2012,6), (2012,8)))

baseline = { }

# Baseline per metric: Spearman rank correlation between a document's raw
# tweet count and its score for that metric.
for ind, metricName in zip(range(0, len(metricNames)), metricNames):
    pairs = []
    for doc in docs:
        numTweets = len(doc[0])
        metricScore = doc[1][ind]
        pairs.append([numTweets, metricScore])

    x, y = zip(*pairs)
    s, p = stats.spearmanr(x, y)

    baseline[metricName] = s

count = 0
count2 = 0
# NOTE(review): the loop body below is truncated — only `pairs = []` remains
# before an unrelated def starts at column 0; the rest of this analysis
# appears to have been lost when the file was spliced together.
for ind, metricName in zip(range(0, len(metricNames)), metricNames):
    pairs = []
def canBeEncoded(text):
    """Return True when str(text) succeeds, False on UnicodeEncodeError.

    Under Python 2 this means the value survives a plain byte-string
    conversion (i.e. is ASCII-encodable); any other exception propagates.
    """
    try:
        str(text)
    except UnicodeEncodeError:
        return False
    return True

def tweetsBetweenDay(documents, lowerBound, upperBound):
    """Collect tweet records whose age relative to their document's
    publication lies within [lowerBound, upperBound] days (inclusive).

    Returns one [text, timestamp, username, doi, title, publicationTimestamp]
    list per matching tweet; tweets whose text or document title cannot be
    byte-encoded (see canBeEncoded) are skipped.
    """
    secondsPerDay = 60 * 60 * 24
    selected = []
    for doc in documents:
        for tweet in doc.tweets:
            age = tweet.timestamp - doc.publicationTimestamp
            if not (lowerBound * secondsPerDay <= age <= upperBound * secondsPerDay):
                continue
            if not (canBeEncoded(tweet.text) and canBeEncoded(doc.title)):
                continue
            selected.append([tweet.text, tweet.timestamp, tweet.username,
                             doc.doi, doc.title, doc.publicationTimestamp])
    return selected

relevantDocuments = SimpleDoc.getallBetween((2012, 6), (2012, 8))

# Stratified sample of tweets by age (days since publication): 111 each from
# the 0-1, 1-3 and 3-5 day windows, 333 each from 7-30 and 100-300 days.
# NOTE(review): `random` is not imported anywhere in the visible part of this
# file — confirm it is imported in a section not shown here.
tweets = []
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 0, 1), 111))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 1, 3), 111))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 3, 5), 111))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 7, 30), 333))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 100, 300), 333))

# One tab-separated line per sampled tweet, shuffled so the output file does
# not reveal the age stratum.
tweetTexts = map(lambda tweetdata: "\t".join([str(tweetdata[0]), str(tweetdata[1]), tweetdata[2], tweetdata[3], tweetdata[4], str(tweetdata[5])]), tweets)
random.shuffle(tweetTexts)

# Write the first 333 lines, flattening newlines and stripping quote
# characters so each tweet stays on a single line.
f = open("tweetTexts_1.txt", "w")
for text in tweetTexts[0:333]:
    f.write(text.replace("\n", " ").replace("\"", "").replace("'", "") + "\n")
f.close()
    valuesForMetric = filter(lambda x: x != None, map(lambda doc: applyCall(doc, call),
        SimpleDoc.getallBetween(lowerBound, None)
    ))

    minV, maxV, meanV, std = min(valuesForMetric), max(valuesForMetric), np.mean(valuesForMetric), np.std(valuesForMetric)
    print attName + "\t" + "\t".join(map(lambda x: str(x), [minV, maxV, meanV, std]))
"""

# NOTE(review): `cat` is unused while the discipline filter below stays
# commented out.
cat = "Biological Sciences"
"""consideredDocs = filter(
        lambda doc: 
            doc.mendeleyDisciplines != None and cat in doc.mendeleyDisciplines, 
            SimpleDoc.getallBetween((2012,6), (2012,8))
    )"""
consideredDocs = SimpleDoc.getallBetween((2012,6), (2012,8))
print len(consideredDocs)
# Pairwise correlations between every attribute pair over the considered docs.
matrix = getAttributeValueMatrix(consideredDocs, calls)
corrs = correlationBetweenEverything(matrix, attributeNames)

"""f = open("foo", "w")
for corr in corrs:
    f.write(corr.toJson() + "\n")
f.close()"""

# corrs = CorrelationItem.fromFile("stuff/pairwise_corr_2012-6_2012-8.json")

f = open("foo", "w")
m = []
for a1 in attributeNames:
    row = []