#!/usr/bin/env python
# coding: utf-8
#
# Author: Peinan ZHANG
# Created at: 2014-10-24

import sys
from q010_sort_second_wFreq import sortSecWFreq
from collections import defaultdict

def calcBigramFreq(lines, unigram_freq):
  """Turn bigram count lines into relative frequencies.

  Each non-blank line must hold three tab-separated fields:
  ``count<TAB>current_word<TAB>next_word``. The count is divided by
  ``unigram_freq[current_word]``.

  Args:
    lines: iterable of lines, either UTF-8 byte strings or already-decoded
      text. Lines that are empty after stripping are skipped.
    unigram_freq: mapping indexed by the current word; its value is used as
      the divisor (presumably the word's unigram count -- confirm against
      sortSecWFreq).

  Returns:
    dict mapping ``'current_word\\tnext_word'`` to the float ratio.

  Raises:
    ValueError: a non-blank line does not have exactly three fields, or its
      count is not numeric.
    KeyError: the current word is missing from ``unigram_freq``.
    ZeroDivisionError: the unigram value for the current word is zero.
  """
  bigram_freq = {}
  for line in lines:
    line = line.strip()
    if not line:
      # A blank line (typically a trailing newline on stdin) has no fields
      # to unpack, so skip it instead of crashing.
      continue
    if isinstance(line, bytes):
      # Decode only raw bytes; text that is already decoded has no need for
      # (and, on Python 3, no support for) another decode step.
      line = line.decode('utf-8')
    count, c_word, n_word = line.split('\t')
    bigram_freq['%s\t%s' % (c_word, n_word)] = \
        float(count) / unigram_freq[c_word]
  return bigram_freq


if __name__ == '__main__':
  # Read bigram count lines from stdin, divide each by the value looked up
  # for its first word, and print the results from highest to lowest.
  unigram_freq = sortSecWFreq('data/medline.txt.sent.tok')
  bigram_freq = calcBigramFreq(sys.stdin.readlines(), unigram_freq)
  ranked = sorted(bigram_freq.items(), key=lambda pair: pair[1], reverse=True)
  for bigram, ratio in ranked:
    sys.stdout.write('%f\t%s\n' % (ratio, bigram.encode('utf-8')))
#!/usr/bin/env python
# coding: utf-8
#
# Author: Peinan ZHANG
# Created at: 2014-10-23

import sys

def topN(freqDict, N=100, c=True):
  """Format the N highest-valued entries of a frequency mapping.

  Args:
    freqDict: mapping from a text key to a numeric value.
    N: maximum number of entries to return; zero or a negative value yields
      an empty list.
    c: when truthy each line is ``'%3d %s\\n' % (value, key)``; when falsy
      only ``'%s\\n' % key`` is produced.

  Returns:
    list of newline-terminated strings ordered by descending value. Entries
    with equal values keep the mapping's iteration order.
  """
  ranked = sorted(freqDict.items(), key=lambda x: x[1], reverse=True)
  return_list = []
  for rank, (k, v) in enumerate(ranked):
    if rank >= N:
      break
    # Python 2 unicode keys must be encoded to UTF-8 before being written to
    # stdout; keys that are already native `str` are used unchanged so they
    # are not double-encoded (or rendered as b'...' on Python 3).
    word = k if isinstance(k, str) else k.encode('utf-8')
    if c:
      return_list.append('%3d %s\n' % (v, word))
    else:
      return_list.append('%s\n' % word)
  return return_list

if __name__ == '__main__':
  # Build the frequency table for the file named on the command line and
  # print its highest-valued entries (topN's default limit applies).
  top_entries = topN(sortSecWFreq(sys.argv[1]))
  sys.stdout.writelines(top_entries)

#!/usr/bin/env python
# coding: utf-8
#
# Author: Peinan ZHANG
# Created at: 2014-10-23

import sys, os
from q027_top100 import topN
from q010_sort_second_wFreq import sortSecWFreq

def mkBigramFile(filepath):
  """Write every character bigram of a UTF-8 text file to ``bigram.temp``.

  Each input line is stripped and decoded as UTF-8. A line of two characters
  or fewer (including an empty one) is copied through as a single output
  line; a longer line contributes one output line per adjacent character
  pair. The output file is created in the current working directory and is
  overwritten if it already exists.

  Args:
    filepath: path of the UTF-8 encoded input file.
  """
  # Both files are opened in binary mode so the explicit UTF-8 decode/encode
  # below is valid on Python 2 and Python 3 alike. The input is opened first
  # so that a bad path does not leave an empty bigram.temp behind, and both
  # handles are closed by their context managers.
  with open(filepath, 'rb') as textFile:
    with open('bigram.temp', 'wb') as bigramFile:
      for raw in textFile:
        line = raw.strip().decode('utf-8')
        if len(line) <= 2:
          bigramFile.write((line + '\n').encode('utf-8'))
          continue
        # Pair each character with its successor.
        for left, right in zip(line, line[1:]):
          bigramFile.write((left + right + '\n').encode('utf-8'))

if __name__ == '__main__':
  # Generate the character bigrams of the file named on the command line,
  # print the most frequent ones, and always clean up the scratch file.
  filepath = sys.argv[1]
  try:
    mkBigramFile(filepath)
    for item in topN(sortSecWFreq('bigram.temp')):
      sys.stdout.write(item)
  finally:
    # Remove the scratch file even when counting or printing fails; it may
    # not exist if mkBigramFile failed before creating it.
    if os.path.exists('bigram.temp'):
      os.remove('bigram.temp')